[llvm] fa19d67 - [X86][AVX] Extend combineCommutableSHUFP to handle v8f32 and v16f32 commutable shufps patterns

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 26 11:04:50 PST 2020


Author: Simon Pilgrim
Date: 2020-01-26T19:04:12Z
New Revision: fa19d67a2a237c6b85c96f350b349851416c9bd9

URL: https://github.com/llvm/llvm-project/commit/fa19d67a2a237c6b85c96f350b349851416c9bd9
DIFF: https://github.com/llvm/llvm-project/commit/fa19d67a2a237c6b85c96f350b349851416c9bd9.diff

LOG: [X86][AVX] Extend combineCommutableSHUFP to handle v8f32 and v16f32 commutable shufps patterns

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
    llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 38f0f52a0440..26a11dc4c548 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34555,8 +34555,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
                                       SelectionDAG &DAG) {
-  // TODO: Add general vXf32 + vXf64 support.
-  if (VT != MVT::v4f32)
+  // TODO: Add vXf64 support.
+  if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
     return SDValue();
 
   // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.

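Background on the transform (not part of the commit message): the hunk above
only widens the type check; the commute itself, referenced by the
"SHUFP(LHS, RHS) -> SHUFP(RHS, LHS)" comment, lives in the body of
combineCommutableSHUFP, which this diff does not show. SHUFPS can fold a
memory operand only as its second source, so a load in the first source
needs a separate vmovaps unless the operands are commuted. Commuting the
two sources amounts to swapping the nibbles of the shuffle immediate:
imm[3:0] indexes the first source and imm[7:4] the second, repeated per
128-bit lane for the YMM/ZMM forms. A minimal standalone C++ sketch of
that immediate swap (hypothetical helper name, not LLVM's code):

    #include <cassert>
    #include <cstdint>

    // Swap which source each nibble of a SHUFPS immediate selects from.
    static uint8_t commuteShufpsImm(uint8_t Imm) {
      return static_cast<uint8_t>(((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4));
    }

    int main() {
      // Decoded from the asm comments in the v8f32 test below:
      // shufps(load, x) with imm 0x20 becomes shufps(x, load) with
      // imm 0x02, letting isel fold the load.
      assert(commuteShufpsImm(0x20) == 0x02);
      return 0;
    }
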
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 790a6798e149..99a5c5f63d7d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1308,16 +1308,14 @@ define <8 x float> @shuffle_v8f32_32107654_v4f32(<4 x float> %a, <4 x float> %b)
 define <8 x float> @shuffle_mem_v8f32_8BA0CFE4(<8 x float> %a0, <8 x float>* %a1) {
 ; AVX1OR2-LABEL: shuffle_mem_v8f32_8BA0CFE4:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovaps (%rdi), %ymm1
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[2,0],ymm0[4,7],ymm1[6,4]
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[2,0],mem[0,0],ymm0[6,4],mem[4,4]
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_mem_v8f32_8BA0CFE4:
 ; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vmovaps (%rdi), %ymm1
-; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
-; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[2,0],ymm0[4,7],ymm1[6,4]
+; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[2,0],mem[0,0],ymm0[6,4],mem[4,4]
+; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
 ; AVX512VL-SLOW-NEXT:    retq
 ;
 ; AVX512VL-FAST-LABEL: shuffle_mem_v8f32_8BA0CFE4:

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 14faa40b654b..3b14959035eb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -274,9 +274,8 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
 define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, <16 x float>* %a1) {
 ; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovaps (%rdi), %zmm1
-; ALL-NEXT:    vshufps {{.*#+}} zmm1 = zmm1[0,0],zmm0[2,0],zmm1[4,4],zmm0[6,4],zmm1[8,8],zmm0[10,8],zmm1[12,12],zmm0[14,12]
-; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[2,0],zmm0[4,7],zmm1[6,4],zmm0[8,11],zmm1[10,8],zmm0[12,15],zmm1[14,12]
+; ALL-NEXT:    vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
+; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
 ; ALL-NEXT:    retq
   %1 = load <16 x float>, <16 x float>* %a1
   %2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>

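In both test files the effect is the same: the explicit vmovaps of (%rdi)
disappears and the load is folded directly into the first vshufps as its
second source (shown as "mem" in the asm comments), while the second
vshufps's element indices are updated to match the commuted element order
of the intermediate result. Each sequence is one instruction shorter.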

        

