[llvm] c3f01f1 - [X86] Add inst fixup for `unpckpd` -> `unpckqdq`.
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 9 22:17:34 PDT 2023
Author: Noah Goldstein
Date: 2023-04-10T00:16:57-05:00
New Revision: c3f01f13b10d708b9b7ff45a6ccc2f0c3462b3af
URL: https://github.com/llvm/llvm-project/commit/c3f01f13b10d708b9b7ff45a6ccc2f0c3462b3af
DIFF: https://github.com/llvm/llvm-project/commit/c3f01f13b10d708b9b7ff45a6ccc2f0c3462b3af.diff
LOG: [X86] Add inst fixup for `unpckpd` -> `unpckqdq`.
`unpckqdq` seems to be treated as a shuffle from a bypass-delay
perspective (which makes sense, as it appears to use the shared shuffle
units on all micro-architectures).
`unpckqdq` is slightly preferable to `shufpd` as it saves 1 byte of
code size and can be used to replace the micro-fused `rm` version. So,
if the target has no bypass delay, we should do `unpckpd` ->
`unpckqdq` instead of `shufpd`.
Reviewed By: pengfei
Differential Revision: https://reviews.llvm.org/D147728
Added:
Modified:
llvm/lib/Target/X86/X86FixupInstTuning.cpp
llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 599d7499125a..6f63cca8c7d4 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -154,14 +154,18 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};
- // `vunpcklpd/vmovlhps r, r` -> `vshufpd r, r, 0x00`
- // `vunpckhpd/vmovlhps r, r` -> `vshufpd r, r, 0xff`
- // `vunpcklpd r, r, k` -> `vshufpd r, r, 0x00`
- // `vunpckhpd r, r, k` -> `vshufpd r, r, 0xff`
- // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with
- // `vunpck{l|h}pd` as it uses less code size.
- // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PS`
- // as the replacement. `{VP}UNPCK{L|H}QDQ{...}` has no codesize cost.
+ // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00`
+ // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff`
+ // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00`
+ // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff`
+ // `vunpcklpd r, m` -> `vunpcklqdq r, m`
+ // `vunpckhpd r, m` -> `vunpckhqdq r, m`
+ // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k`
+ // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k`
+ // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd`
+ // -> `vunpck{l|h}qdq`
+ // 2) If `vshufpd` faster than `vunpck{l|h}pd`
+ // -> `vshufpd`
auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
return false;
@@ -171,13 +175,34 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};
- auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool {
+ auto ProcessUNPCKPDToIntDomain = [&](unsigned NewOpc) -> bool {
+ // TODO it may be worth it to set ReplaceInTie to `true` as there is no real
+ // downside to the integer unpck, but if someone doesn't specify exact
+ // target we won't find it faster.
+ if (!ST->hasNoDomainDelayShuffle() ||
+ !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
+ return false;
+ MI.setDesc(TII->get(NewOpc));
+ return true;
+ };
+
+ auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain,
+ unsigned NewOpc) -> bool {
+ if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain))
+ return true;
return ProcessUNPCKPD(NewOpc, 0x00);
};
- auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool {
+ auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain,
+ unsigned NewOpc) -> bool {
+ if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain))
+ return true;
return ProcessUNPCKPD(NewOpc, 0xff);
};
+ auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool {
+ return ProcessUNPCKPDToIntDomain(NewOpcIntDomain);
+ };
+
switch (Opc) {
case X86::VPERMILPSri:
return ProcessVPERMILPSri(X86::VSHUFPSrri);
@@ -226,64 +251,106 @@ bool X86FixupInstTuningPass::processInstruction(
case X86::VPERMILPSZmik:
return ProcessVPERMILPSmi(X86::VPSHUFDZmik);
- // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to
- // `{VP}UNPCK{L|H}QDQ{...}` which gets the same perf benefit as
- // `{V}SHUF{...}PS` but 1) without increasing code size and 2) can also
- // handle the `mr` case. ICL doesn't have a domain penalty for replacing
- // float unpck -> int unpck, but at this time, I haven't verified the set of
- // processors where its safe.
case X86::MOVLHPSrr:
case X86::UNPCKLPDrr:
- return ProcessUNPCKLPDrr(X86::SHUFPDrri);
+ return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri);
case X86::VMOVLHPSrr:
case X86::VUNPCKLPDrr:
- return ProcessUNPCKLPDrr(X86::VSHUFPDrri);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri);
case X86::VUNPCKLPDYrr:
- return ProcessUNPCKLPDrr(X86::VSHUFPDYrri);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri);
// VMOVLHPS is always 128 bits.
case X86::VMOVLHPSZrr:
case X86::VUNPCKLPDZ128rr:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rri);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri);
case X86::VUNPCKLPDZ256rr:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rri);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri);
case X86::VUNPCKLPDZrr:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZrri);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri);
case X86::VUNPCKLPDZ128rrk:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrik);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik);
case X86::VUNPCKLPDZ256rrk:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrik);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik);
case X86::VUNPCKLPDZrrk:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZrrik);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik);
case X86::VUNPCKLPDZ128rrkz:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrikz);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
case X86::VUNPCKLPDZ256rrkz:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrikz);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
case X86::VUNPCKLPDZrrkz:
- return ProcessUNPCKLPDrr(X86::VSHUFPDZrrikz);
+ return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz);
case X86::UNPCKHPDrr:
- return ProcessUNPCKHPDrr(X86::SHUFPDrri);
+ return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri);
case X86::VUNPCKHPDrr:
- return ProcessUNPCKHPDrr(X86::VSHUFPDrri);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri);
case X86::VUNPCKHPDYrr:
- return ProcessUNPCKHPDrr(X86::VSHUFPDYrri);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri);
case X86::VUNPCKHPDZ128rr:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rri);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri);
case X86::VUNPCKHPDZ256rr:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rri);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri);
case X86::VUNPCKHPDZrr:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZrri);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri);
case X86::VUNPCKHPDZ128rrk:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrik);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik);
case X86::VUNPCKHPDZ256rrk:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrik);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik);
case X86::VUNPCKHPDZrrk:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZrrik);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik);
case X86::VUNPCKHPDZ128rrkz:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrikz);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
case X86::VUNPCKHPDZ256rrkz:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrikz);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
case X86::VUNPCKHPDZrrkz:
- return ProcessUNPCKHPDrr(X86::VSHUFPDZrrikz);
+ return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz);
+ case X86::UNPCKLPDrm:
+ return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm);
+ case X86::VUNPCKLPDrm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm);
+ case X86::VUNPCKLPDYrm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm);
+ case X86::VUNPCKLPDZ128rm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm);
+ case X86::VUNPCKLPDZ256rm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm);
+ case X86::VUNPCKLPDZrm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm);
+ case X86::VUNPCKLPDZ128rmk:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk);
+ case X86::VUNPCKLPDZ256rmk:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk);
+ case X86::VUNPCKLPDZrmk:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk);
+ case X86::VUNPCKLPDZ128rmkz:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz);
+ case X86::VUNPCKLPDZ256rmkz:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz);
+ case X86::VUNPCKLPDZrmkz:
+ return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz);
+ case X86::UNPCKHPDrm:
+ return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm);
+ case X86::VUNPCKHPDrm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm);
+ case X86::VUNPCKHPDYrm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm);
+ case X86::VUNPCKHPDZ128rm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm);
+ case X86::VUNPCKHPDZ256rm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm);
+ case X86::VUNPCKHPDZrm:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm);
+ case X86::VUNPCKHPDZ128rmk:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk);
+ case X86::VUNPCKHPDZ256rmk:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk);
+ case X86::VUNPCKHPDZrmk:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk);
+ case X86::VUNPCKHPDZ128rmkz:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz);
+ case X86::VUNPCKHPDZ256rmkz:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz);
+ case X86::VUNPCKHPDZrmkz:
+ return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz);
default:
return false;
}
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
index bc137a59a9f8..4a160bc9debc 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
@@ -31,10 +31,15 @@ define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounw
; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKLPDYrr:
; CHECK-V4: # %bb.0:
@@ -60,10 +65,15 @@ define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounw
; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKHPDYrr:
; CHECK-V4: # %bb.0:
@@ -89,10 +99,15 @@ define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwi
; CHECK-SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKLPDrr:
; CHECK-V4: # %bb.0:
@@ -118,10 +133,15 @@ define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwi
; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKHPDrr:
; CHECK-V4: # %bb.0:
@@ -172,11 +192,17 @@ define <4 x double> @transform_VUNPCKLPDYrrkz(<4 x double> %a, <4 x double> %b,
; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrkz:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKLPDYrrkz:
; CHECK-V4: # %bb.0:
@@ -208,11 +234,17 @@ define <4 x double> @transform_VUNPCKHPDYrrkz(<4 x double> %a, <4 x double> %b,
; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrkz:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKHPDYrrkz:
; CHECK-V4: # %bb.0:
@@ -244,11 +276,17 @@ define <2 x double> @transform_VUNPCKLPDrrkz(<2 x double> %a, <2 x double> %b, i
; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrrkz:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKLPDrrkz:
; CHECK-V4: # %bb.0:
@@ -280,11 +318,17 @@ define <2 x double> @transform_VUNPCKHPDrrkz(<2 x double> %a, <2 x double> %b, i
; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrrkz:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKHPDrrkz:
; CHECK-V4: # %bb.0:
@@ -343,12 +387,19 @@ define <4 x double> @transform_VUNPCKLPDYrrk(<4 x double> %a, <4 x double> %b, <
; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrk:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKLPDYrrk:
; CHECK-V4: # %bb.0:
@@ -384,12 +435,19 @@ define <4 x double> @transform_VUNPCKHPDYrrk(<4 x double> %a, <4 x double> %b, <
; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrk:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKHPDYrrk:
; CHECK-V4: # %bb.0:
@@ -425,12 +483,19 @@ define <2 x double> @transform_VUNPCKLPDrrk(<2 x double> %a, <2 x double> %b, <2
; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrrk:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKLPDrrk:
; CHECK-V4: # %bb.0:
@@ -466,12 +531,19 @@ define <2 x double> @transform_VUNPCKHPDrrk(<2 x double> %a, <2 x double> %b, <2
; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0
; CHECK-SKX-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrrk:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: kmovd %edi, %k1
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-V4-LABEL: transform_VUNPCKHPDrrk:
; CHECK-V4: # %bb.0:
@@ -520,40 +592,140 @@ define <16 x float> @transform_VUNPCKHPDZrm(<16 x float> %a, ptr %pb) nounwind {
}
define <8 x float> @transform_VUNPCKLPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ZNVER4-NEXT: retq
%b = load <8 x float>, ptr %pb
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
ret <8 x float> %shufp
}
define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ZNVER4-NEXT: retq
%b = load <8 x float>, ptr %pb
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
ret <8 x float> %shufp
}
define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrm:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrm:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDrm:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT: retq
%b = load <4 x float>, ptr %pb
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shufp
}
define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrm:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrm:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDrm:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ZNVER4-NEXT: retq
%b = load <4 x float>, ptr %pb
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x float> %shufp
@@ -586,11 +758,41 @@ define <8 x double> @transform_VUNPCKHPDZrmkz(<8 x double> %a, ptr %pb, i8 %mask
}
define <4 x double> @transform_VUNPCKLPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrmkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
%b = load <4 x double>, ptr %pb
%shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -599,11 +801,41 @@ define <4 x double> @transform_VUNPCKLPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask
}
define <4 x double> @transform_VUNPCKHPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrmkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
%b = load <4 x double>, ptr %pb
%shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -612,11 +844,41 @@ define <4 x double> @transform_VUNPCKHPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask
}
define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrmkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i2 %mask_int to <2 x i1>
%b = load <2 x double>, ptr %pb
%shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
@@ -625,11 +887,41 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_
}
define <2 x double> @transform_VUNPCKHPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i2 %mask_int to <2 x i1>
%b = load <2 x double>, ptr %pb
%shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
@@ -666,12 +958,47 @@ define <8 x double> @transform_VUNPCKHPDZrmk(<8 x double> %a, ptr %pb, <8 x doub
}
define <4 x double> @transform_VUNPCKLPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrmk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SKX-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-V4-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX512-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ZNVER4-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
%b = load <4 x double>, ptr %pb
%shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -680,12 +1007,47 @@ define <4 x double> @transform_VUNPCKLPDYrmk(<4 x double> %a, ptr %pb, <4 x doub
}
define <4 x double> @transform_VUNPCKHPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrmk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SKX-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-V4-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX512-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ZNVER4-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
%b = load <4 x double>, ptr %pb
%shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -694,12 +1056,47 @@ define <4 x double> @transform_VUNPCKHPDYrmk(<4 x double> %a, ptr %pb, <4 x doub
}
define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrmk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
-; CHECK-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i2 %mask_int to <2 x i1>
%b = load <2 x double>, ptr %pb
%shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
@@ -708,12 +1105,47 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
}
define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrmk:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: kmovd %esi, %k1
+; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: kmovd %esi, %k1
+; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: kmovd %esi, %k1
+; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i2 %mask_int to <2 x i1>
%b = load <2 x double>, ptr %pb
%shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
@@ -721,5 +1153,4 @@ define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
ret <2 x double> %res
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ICX-BYPASS-DELAY: {{.*}}
-; CHECK-ICX-NO-BYPASS-DELAY: {{.*}}
+; CHECK-ICX: {{.*}}
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
index 481ecdb256ce..6940c33c9d32 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
@@ -12,10 +12,15 @@ define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounw
; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-AVX2-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-SNB-LABEL: transform_VUNPCKLPDYrr:
; CHECK-SNB: # %bb.0:
@@ -31,10 +36,15 @@ define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounw
; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-AVX2-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
;
; CHECK-SNB-LABEL: transform_VUNPCKHPDYrr:
; CHECK-SNB: # %bb.0:
@@ -50,15 +60,25 @@ define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwi
; CHECK-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-AVX2-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
;
-; CHECK-SNB-LABEL: transform_VUNPCKLPDrr:
-; CHECK-SNB: # %bb.0:
-; CHECK-SNB-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-SNB-NEXT: retq
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-SNB-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SNB-BYPASS-DELAY-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shufp
}
@@ -69,62 +89,140 @@ define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwi
; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-AVX2-NEXT: retq
;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
-; CHECK-ICX: # %bb.0:
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT: retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
;
-; CHECK-SNB-LABEL: transform_VUNPCKHPDrr:
-; CHECK-SNB: # %bb.0:
-; CHECK-SNB-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-SNB-NEXT: retq
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-SNB-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-SNB-BYPASS-DELAY-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x float> %shufp
}
define <8 x float> @transform_VUNPCKLPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-SNB: # %bb.0:
+; CHECK-SNB-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SNB-NEXT: retq
%b = load <8 x float>, ptr %pb
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
ret <8 x float> %shufp
}
define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-SNB: # %bb.0:
+; CHECK-SNB-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SNB-NEXT: retq
%b = load <8 x float>, ptr %pb
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
ret <8 x float> %shufp
}
define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKLPDrm:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-SNB-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-SNB-BYPASS-DELAY-NEXT: retq
%b = load <4 x float>, ptr %pb
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shufp
}
define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-NEXT: retq
+; CHECK-AVX2-LABEL: transform_VUNPCKHPDrm:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-BYPASS-DELAY: # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-SNB-BYPASS-DELAY: # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-BYPASS-DELAY-NEXT: retq
%b = load <4 x float>, ptr %pb
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x float> %shufp
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ICX-BYPASS-DELAY: {{.*}}
-; CHECK-ICX-NO-BYPASS-DELAY: {{.*}}
+; CHECK: {{.*}}
+; CHECK-ICX: {{.*}}
; CHECK-SKL: {{.*}}
-; CHECK-SNB-BYPASS-DELAY: {{.*}}
-; CHECK-SNB-NO-BYPASS-DELAY: {{.*}}
; CHECK-V3: {{.*}}
More information about the llvm-commits
mailing list