[llvm] 2ce1698 - [X86] Fix perf bug in `permilps` -> `shufd` in X86FixupInstTuning.

Noah Goldstein via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 9 22:17:33 PDT 2023


Author: Noah Goldstein
Date: 2023-04-10T00:16:54-05:00
New Revision: 2ce1698a343c599910bceed399ca7020816b230e

URL: https://github.com/llvm/llvm-project/commit/2ce1698a343c599910bceed399ca7020816b230e
DIFF: https://github.com/llvm/llvm-project/commit/2ce1698a343c599910bceed399ca7020816b230e.diff

LOG: [X86] Fix perf bug in `permilps` -> `shufd` in X86FixupInstTuning.

We shouldn't do the transformation if either the target has a domain
bypass delay OR the new opcode has worse performance. The previous code
incorrectly combined the two conditions with AND.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D147727

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86FixupInstTuning.cpp
    llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
    llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 4afd6007e872..599d7499125a 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -147,7 +147,7 @@ bool X86FixupInstTuningPass::processInstruction(
   auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
     // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as
     // `vpshufd` saves a byte of code size.
-    if (!ST->hasNoDomainDelayShuffle() &&
+    if (!ST->hasNoDomainDelayShuffle() ||
         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
     MI.setDesc(TII->get(NewOpc));

diff  --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
index 55cd0f4d1345..5d031f6017c7 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
@@ -108,40 +108,25 @@ define <4 x float> @transform_VPERMILPSrrk(<4 x float> %a, <4 x float> %b, i4 %m
 }
 
 define <16 x float> @transform_VPERMILPSZrm(ptr %ap) nounwind {
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSZrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSZrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VPERMILPSZrm:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VPERMILPSZrm:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrm:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VPERMILPSZrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT:    retq
   %a = load <16 x float>, ptr %ap
   %shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
   ret <16 x float> %shufp
 }
 
 define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind {
-; CHECK-ICX-LABEL: transform_VPERMILPSYrm:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSYrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSYrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VPERMILPSYrm:
 ; CHECK-V4:       # %bb.0:
@@ -163,10 +148,15 @@ define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind {
 }
 
 define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
-; CHECK-ICX-LABEL: transform_VPERMILPSrm:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VPERMILPSrm:
 ; CHECK-V4:       # %bb.0:
@@ -188,35 +178,11 @@ define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
 }
 
 define <16 x float> @transform_VPERMILPSZrmkz(ptr %ap, i16 %mask_int) nounwind {
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSZrmkz:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSZrmkz:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VPERMILPSZrmkz:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VPERMILPSZrmkz:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrmkz:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VPERMILPSZrmkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT:    retq
   %mask = bitcast i16 %mask_int to <16 x i1>
   %a = load <16 x float>, ptr %ap
   %shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -225,11 +191,17 @@ define <16 x float> @transform_VPERMILPSZrmkz(ptr %ap, i16 %mask_int) nounwind {
 }
 
 define <8 x float> @transform_VPERMILPSYrmkz(ptr %ap, i8 %mask_int) nounwind {
-; CHECK-ICX-LABEL: transform_VPERMILPSYrmkz:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSYrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSYrmkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VPERMILPSYrmkz:
 ; CHECK-V4:       # %bb.0:
@@ -256,11 +228,17 @@ define <8 x float> @transform_VPERMILPSYrmkz(ptr %ap, i8 %mask_int) nounwind {
 }
 
 define <4 x float> @transform_VPERMILPSrmkz(ptr %ap, i4 %mask_int) nounwind {
-; CHECK-ICX-LABEL: transform_VPERMILPSrmkz:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSrmkz:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSrmkz:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VPERMILPSrmkz:
 ; CHECK-V4:       # %bb.0:
@@ -287,35 +265,11 @@ define <4 x float> @transform_VPERMILPSrmkz(ptr %ap, i4 %mask_int) nounwind {
 }
 
 define <16 x float> @transform_VPERMILPSZrmk(ptr %ap, <16 x float> %b, i16 %mask_int) nounwind {
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSZrmk:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSZrmk:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VPERMILPSZrmk:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VPERMILPSZrmk:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrmk:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VPERMILPSZrmk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT:    retq
   %mask = bitcast i16 %mask_int to <16 x i1>
   %a = load <16 x float>, ptr %ap
   %shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -324,11 +278,17 @@ define <16 x float> @transform_VPERMILPSZrmk(ptr %ap, <16 x float> %b, i16 %mask
 }
 
 define <8 x float> @transform_VPERMILPSYrmk(ptr %ap, <8 x float> %b, i8 %mask_int) nounwind {
-; CHECK-ICX-LABEL: transform_VPERMILPSYrmk:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSYrmk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSYrmk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VPERMILPSYrmk:
 ; CHECK-V4:       # %bb.0:
@@ -355,11 +315,17 @@ define <8 x float> @transform_VPERMILPSYrmk(ptr %ap, <8 x float> %b, i8 %mask_in
 }
 
 define <4 x float> @transform_VPERMILPSrmk(ptr %ap, <4 x float> %b, i4 %mask_int) nounwind {
-; CHECK-ICX-LABEL: transform_VPERMILPSrmk:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSrmk:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSrmk:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VPERMILPSrmk:
 ; CHECK-V4:       # %bb.0:
@@ -384,3 +350,5 @@ define <4 x float> @transform_VPERMILPSrmk(ptr %ap, <4 x float> %b, i4 %mask_int
   %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> %b
   ret <4 x float> %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-ICX: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll
index 2f0d7d1517e8..01f8df684e74 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll
@@ -39,7 +39,7 @@ define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind {
 ;
 ; CHECK-AVX2-LABEL: transform_VPERMILPSYrm:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX2-DELAY-LABEL: transform_VPERMILPSYrm:
@@ -47,10 +47,15 @@ define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind {
 ; CHECK-AVX2-DELAY-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
 ; CHECK-AVX2-DELAY-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VPERMILPSYrm:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSYrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSYrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-SNB-LABEL: transform_VPERMILPSYrm:
 ; CHECK-SNB:       # %bb.0:
@@ -64,7 +69,7 @@ define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind {
 define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
 ; CHECK-AVX1-LABEL: transform_VPERMILPSrm:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX1-DELAY-LABEL: transform_VPERMILPSrm:
@@ -74,7 +79,7 @@ define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
 ;
 ; CHECK-AVX2-LABEL: transform_VPERMILPSrm:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX2-DELAY-LABEL: transform_VPERMILPSrm:
@@ -82,21 +87,28 @@ define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
 ; CHECK-AVX2-DELAY-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
 ; CHECK-AVX2-DELAY-NEXT:    retq
 ;
-; CHECK-ICX-LABEL: transform_VPERMILPSrm:
-; CHECK-ICX:       # %bb.0:
-; CHECK-ICX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
-; CHECK-ICX-NEXT:    retq
+; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
 ;
-; CHECK-SNB-LABEL: transform_VPERMILPSrm:
-; CHECK-SNB:       # %bb.0:
-; CHECK-SNB-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
-; CHECK-SNB-NEXT:    retq
+; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
+;
+; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
   %a = load <4 x float>, ptr %ap
   %shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %shufp
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ICX-BYPASS-DELAY: {{.*}}
-; CHECK-ICX-NO-BYPASS-DELAY: {{.*}}
-; CHECK-SNB-BYPASS-DELAY: {{.*}}
-; CHECK-SNB-NO-BYPASS-DELAY: {{.*}}
+; CHECK-ICX: {{.*}}


        


More information about the llvm-commits mailing list