[llvm] c3f01f1 - [X86] Add inst fixup for `unpckpd` -> `unpckqdq`.

Noah Goldstein via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 9 22:17:34 PDT 2023


Author: Noah Goldstein
Date: 2023-04-10T00:16:57-05:00
New Revision: c3f01f13b10d708b9b7ff45a6ccc2f0c3462b3af

URL: https://github.com/llvm/llvm-project/commit/c3f01f13b10d708b9b7ff45a6ccc2f0c3462b3af
DIFF: https://github.com/llvm/llvm-project/commit/c3f01f13b10d708b9b7ff45a6ccc2f0c3462b3af.diff

LOG: [X86] Add inst fixup for `unpckpd` -> `unpckqdq`.

`unpckqdq` seems to be treated as a shuffle from a bypass-delay
perspective (which makes sense, as it appears to share the shuffle
units on all micro-architectures).

`unpckqdq` is slightly preferable to `shufpd` as it saves 1 byte of
code size and can also be used to replace the micro-fused `rm` version.
So, if the target has no bypass delay, we should do `unpckpd` ->
`unpckqdq` instead of `shufpd`.

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D147728

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86FixupInstTuning.cpp
    llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
    llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 599d7499125a..6f63cca8c7d4 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -154,14 +154,18 @@ bool X86FixupInstTuningPass::processInstruction(
     return true;
   };
 
-  // `vunpcklpd/vmovlhps r, r` -> `vshufpd r, r, 0x00`
-  // `vunpckhpd/vmovlhps r, r` -> `vshufpd r, r, 0xff`
-  // `vunpcklpd r, r, k` -> `vshufpd r, r, 0x00`
-  // `vunpckhpd r, r, k` -> `vshufpd r, r, 0xff`
-  // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with
-  // `vunpck{l|h}pd` as it uses less code size.
-  // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PS`
-  // as the replacement. `{VP}UNPCK{L|H}QDQ{...}` has no codesize cost.
+  // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00`
+  // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff`
+  // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00`
+  // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff`
+  // `vunpcklpd r, m` -> `vunpcklqdq r, m`
+  // `vunpckhpd r, m` -> `vunpckhqdq r, m`
+  // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k`
+  // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k`
+  // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd`
+  //        -> `vunpck{l|h}qdq`
+  // 2) If `vshufpd` faster than `vunpck{l|h}pd`
+  //        -> `vshufpd`
   auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
     if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
@@ -171,13 +175,34 @@ bool X86FixupInstTuningPass::processInstruction(
     return true;
   };
 
-  auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool {
+  auto ProcessUNPCKPDToIntDomain = [&](unsigned NewOpc) -> bool {
+    // TODO it may be worth it to set ReplaceInTie to `true` as there is no real
+    // downside to the integer unpck, but if someone doesn't specify exact
+    // target we won't find it faster.
+    if (!ST->hasNoDomainDelayShuffle() ||
+        !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
+      return false;
+    MI.setDesc(TII->get(NewOpc));
+    return true;
+  };
+
+  auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain,
+                               unsigned NewOpc) -> bool {
+    if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain))
+      return true;
     return ProcessUNPCKPD(NewOpc, 0x00);
   };
-  auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool {
+  auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain,
+                               unsigned NewOpc) -> bool {
+    if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain))
+      return true;
     return ProcessUNPCKPD(NewOpc, 0xff);
   };
 
+  auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool {
+    return ProcessUNPCKPDToIntDomain(NewOpcIntDomain);
+  };
+
   switch (Opc) {
   case X86::VPERMILPSri:
     return ProcessVPERMILPSri(X86::VSHUFPSrri);
@@ -226,64 +251,106 @@ bool X86FixupInstTuningPass::processInstruction(
   case X86::VPERMILPSZmik:
     return ProcessVPERMILPSmi(X86::VPSHUFDZmik);
 
-    // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to
-    // `{VP}UNPCK{L|H}QDQ{...}` which gets the same perf benefit as
-    // `{V}SHUF{...}PS` but 1) without increasing code size and 2) can also
-    // handle the `mr` case. ICL doesn't have a domain penalty for replacing
-    // float unpck -> int unpck, but at this time, I haven't verified the set of
-    // processors where its safe.
   case X86::MOVLHPSrr:
   case X86::UNPCKLPDrr:
-    return ProcessUNPCKLPDrr(X86::SHUFPDrri);
+    return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri);
   case X86::VMOVLHPSrr:
   case X86::VUNPCKLPDrr:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDrri);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri);
   case X86::VUNPCKLPDYrr:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDYrri);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri);
     // VMOVLHPS is always 128 bits.
   case X86::VMOVLHPSZrr:
   case X86::VUNPCKLPDZ128rr:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rri);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri);
   case X86::VUNPCKLPDZ256rr:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rri);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri);
   case X86::VUNPCKLPDZrr:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZrri);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri);
   case X86::VUNPCKLPDZ128rrk:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrik);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik);
   case X86::VUNPCKLPDZ256rrk:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrik);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik);
   case X86::VUNPCKLPDZrrk:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZrrik);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik);
   case X86::VUNPCKLPDZ128rrkz:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrikz);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
   case X86::VUNPCKLPDZ256rrkz:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrikz);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
   case X86::VUNPCKLPDZrrkz:
-    return ProcessUNPCKLPDrr(X86::VSHUFPDZrrikz);
+    return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz);
   case X86::UNPCKHPDrr:
-    return ProcessUNPCKHPDrr(X86::SHUFPDrri);
+    return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri);
   case X86::VUNPCKHPDrr:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDrri);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri);
   case X86::VUNPCKHPDYrr:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDYrri);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri);
   case X86::VUNPCKHPDZ128rr:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rri);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri);
   case X86::VUNPCKHPDZ256rr:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rri);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri);
   case X86::VUNPCKHPDZrr:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZrri);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri);
   case X86::VUNPCKHPDZ128rrk:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrik);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik);
   case X86::VUNPCKHPDZ256rrk:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrik);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik);
   case X86::VUNPCKHPDZrrk:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZrrik);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik);
   case X86::VUNPCKHPDZ128rrkz:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrikz);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
   case X86::VUNPCKHPDZ256rrkz:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrikz);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
   case X86::VUNPCKHPDZrrkz:
-    return ProcessUNPCKHPDrr(X86::VSHUFPDZrrikz);
+    return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz);
+  case X86::UNPCKLPDrm:
+    return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm);
+  case X86::VUNPCKLPDrm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm);
+  case X86::VUNPCKLPDYrm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm);
+  case X86::VUNPCKLPDZ128rm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm);
+  case X86::VUNPCKLPDZ256rm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm);
+  case X86::VUNPCKLPDZrm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm);
+  case X86::VUNPCKLPDZ128rmk:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk);
+  case X86::VUNPCKLPDZ256rmk:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk);
+  case X86::VUNPCKLPDZrmk:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk);
+  case X86::VUNPCKLPDZ128rmkz:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz);
+  case X86::VUNPCKLPDZ256rmkz:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz);
+  case X86::VUNPCKLPDZrmkz:
+    return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz);
+  case X86::UNPCKHPDrm:
+    return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm);
+  case X86::VUNPCKHPDrm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm);
+  case X86::VUNPCKHPDYrm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm);
+  case X86::VUNPCKHPDZ128rm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm);
+  case X86::VUNPCKHPDZ256rm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm);
+  case X86::VUNPCKHPDZrm:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm);
+  case X86::VUNPCKHPDZ128rmk:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk);
+  case X86::VUNPCKHPDZ256rmk:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk);
+  case X86::VUNPCKHPDZrmk:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk);
+  case X86::VUNPCKHPDZ128rmkz:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz);
+  case X86::VUNPCKHPDZ256rmkz:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz);
+  case X86::VUNPCKHPDZrmkz:
+    return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz);
   default:
     return false;
   }

diff  --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
index bc137a59a9f8..4a160bc9debc 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
@@ -31,10 +31,15 @@ define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounw
 ; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrr:
 ; CHECK-V4:       # %bb.0:
@@ -60,10 +65,15 @@ define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounw
 ; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrr:
 ; CHECK-V4:       # %bb.0:
@@ -89,10 +99,15 @@ define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwi
 ; CHECK-SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDrr:
 ; CHECK-V4:       # %bb.0:
@@ -118,10 +133,15 @@ define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwi
 ; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKHPDrr:
 ; CHECK-V4:       # %bb.0:
@@ -172,11 +192,17 @@ define <4 x double> @transform_VUNPCKLPDYrrkz(<4 x double> %a, <4 x double> %b,
 ; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrkz:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrrkz:
 ; CHECK-V4:       # %bb.0:
@@ -208,11 +234,17 @@ define <4 x double> @transform_VUNPCKHPDYrrkz(<4 x double> %a, <4 x double> %b,
 ; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrkz:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrrkz:
 ; CHECK-V4:       # %bb.0:
@@ -244,11 +276,17 @@ define <2 x double> @transform_VUNPCKLPDrrkz(<2 x double> %a, <2 x double> %b, i
 ; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrrkz:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDrrkz:
 ; CHECK-V4:       # %bb.0:
@@ -280,11 +318,17 @@ define <2 x double> @transform_VUNPCKHPDrrkz(<2 x double> %a, <2 x double> %b, i
 ; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrrkz:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKHPDrrkz:
 ; CHECK-V4:       # %bb.0:
@@ -343,12 +387,19 @@ define <4 x double> @transform_VUNPCKLPDYrrk(<4 x double> %a, <4 x double> %b, <
 ; CHECK-SKX-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrk:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT:    vmovapd %ymm2, %ymm0
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %ymm2, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %ymm2, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrrk:
 ; CHECK-V4:       # %bb.0:
@@ -384,12 +435,19 @@ define <4 x double> @transform_VUNPCKHPDYrrk(<4 x double> %a, <4 x double> %b, <
 ; CHECK-SKX-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrk:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT:    vmovapd %ymm2, %ymm0
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %ymm2, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %ymm2, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrrk:
 ; CHECK-V4:       # %bb.0:
@@ -425,12 +483,19 @@ define <2 x double> @transform_VUNPCKLPDrrk(<2 x double> %a, <2 x double> %b, <2
 ; CHECK-SKX-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrrk:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT:    vmovapd %xmm2, %xmm0
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDrrk:
 ; CHECK-V4:       # %bb.0:
@@ -466,12 +531,19 @@ define <2 x double> @transform_VUNPCKHPDrrk(<2 x double> %a, <2 x double> %b, <2
 ; CHECK-SKX-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-SKX-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrrk:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %edi, %k1
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT:    vmovapd %xmm2, %xmm0
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %edi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKHPDrrk:
 ; CHECK-V4:       # %bb.0:
@@ -520,40 +592,140 @@ define <16 x float> @transform_VUNPCKHPDZrm(<16 x float> %a, ptr %pb) nounwind {
 }
 
 define <8 x float> @transform_VUNPCKLPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ZNVER4-NEXT:    retq
   %b = load <8 x float>, ptr %pb
   %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
   ret <8 x float> %shufp
 }
 
 define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ZNVER4-NEXT:    retq
   %b = load <8 x float>, ptr %pb
   %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
   ret <8 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrm:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrm:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDrm:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrm:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrm:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDrm:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ZNVER4-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
@@ -586,11 +758,41 @@ define <8 x double> @transform_VUNPCKHPDZrmkz(<8 x double> %a, ptr %pb, i8 %mask
 }
 
 define <4 x double> @transform_VUNPCKLPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrmkz:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmkz:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i4 %mask_int to <4 x i1>
   %b = load <4 x double>, ptr %pb
   %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -599,11 +801,41 @@ define <4 x double> @transform_VUNPCKLPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask
 }
 
 define <4 x double> @transform_VUNPCKHPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrmkz:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmkz:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i4 %mask_int to <4 x i1>
   %b = load <4 x double>, ptr %pb
   %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -612,11 +844,41 @@ define <4 x double> @transform_VUNPCKHPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask
 }
 
 define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrmkz:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmkz:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
@@ -625,11 +887,41 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_
 }
 
 define <2 x double> @transform_VUNPCKHPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
@@ -666,12 +958,47 @@ define <8 x double> @transform_VUNPCKHPDZrmk(<8 x double> %a, ptr %pb, <8 x doub
 }
 
 define <4 x double> @transform_VUNPCKLPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrmk:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SKX-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-V4-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX512-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmk:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ZNVER4-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i4 %mask_int to <4 x i1>
   %b = load <4 x double>, ptr %pb
   %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -680,12 +1007,47 @@ define <4 x double> @transform_VUNPCKLPDYrmk(<4 x double> %a, ptr %pb, <4 x doub
 }
 
 define <4 x double> @transform_VUNPCKHPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrmk:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SKX-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-V4-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX512-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmk:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ZNVER4-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i4 %mask_int to <4 x i1>
   %b = load <4 x double>, ptr %pb
   %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -694,12 +1056,47 @@ define <4 x double> @transform_VUNPCKHPDYrmk(<4 x double> %a, ptr %pb, <4 x doub
 }
 
 define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrmk:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-V4-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmk:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
@@ -708,12 +1105,47 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
 }
 
 define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrmk:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    kmovd %esi, %k1
+; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-SKX-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-V4-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    kmovd %esi, %k1
+; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-V4-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    kmovd %esi, %k1
+; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-AVX512-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmk:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
+; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-ZNVER4-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
@@ -721,5 +1153,4 @@ define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
   ret <2 x double> %res
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ICX-BYPASS-DELAY: {{.*}}
-; CHECK-ICX-NO-BYPASS-DELAY: {{.*}}
+; CHECK-ICX: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
index 481ecdb256ce..6940c33c9d32 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
@@ -12,10 +12,15 @@ define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounw
 ; CHECK-AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-AVX2-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-SNB-LABEL: transform_VUNPCKLPDYrr:
 ; CHECK-SNB:       # %bb.0:
@@ -31,10 +36,15 @@ define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounw
 ; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-AVX2-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-SNB-LABEL: transform_VUNPCKHPDYrr:
 ; CHECK-SNB:       # %bb.0:
@@ -50,15 +60,25 @@ define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwi
 ; CHECK-AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-AVX2-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
 ;
-; CHECK-SNB-LABEL: transform_VUNPCKLPDrr:
-; CHECK-SNB:       # %bb.0:
-; CHECK-SNB-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-SNB-NEXT:    retq
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
+; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
 }
@@ -69,62 +89,140 @@ define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwi
 ; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-AVX2-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
 ;
-; CHECK-SNB-LABEL: transform_VUNPCKHPDrr:
-; CHECK-SNB:       # %bb.0:
-; CHECK-SNB-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-SNB-NEXT:    retq
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
+; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
 }
 
 define <8 x float> @transform_VUNPCKLPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDYrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    retq
+; CHECK-AVX2-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-LABEL: transform_VUNPCKLPDYrm:
+; CHECK-SNB:       # %bb.0:
+; CHECK-SNB-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-SNB-NEXT:    retq
   %b = load <8 x float>, ptr %pb
   %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
   ret <8 x float> %shufp
 }
 
 define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDYrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    retq
+; CHECK-AVX2-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-LABEL: transform_VUNPCKHPDYrm:
+; CHECK-SNB:       # %bb.0:
+; CHECK-SNB-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-SNB-NEXT:    retq
   %b = load <8 x float>, ptr %pb
   %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
   ret <8 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKLPDrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    retq
+; CHECK-AVX2-LABEL: transform_VUNPCKLPDrm:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
+; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-LABEL: transform_VUNPCKHPDrm:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-NEXT:    retq
+; CHECK-AVX2-LABEL: transform_VUNPCKHPDrm:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
+; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ICX-BYPASS-DELAY: {{.*}}
-; CHECK-ICX-NO-BYPASS-DELAY: {{.*}}
+; CHECK: {{.*}}
+; CHECK-ICX: {{.*}}
 ; CHECK-SKL: {{.*}}
-; CHECK-SNB-BYPASS-DELAY: {{.*}}
-; CHECK-SNB-NO-BYPASS-DELAY: {{.*}}
 ; CHECK-V3: {{.*}}


        


More information about the llvm-commits mailing list