[llvm] ed1b0da - [X86] combineConcatVectorOps - fold v4i64/v8x32 concat(broadcast(),broadcast()) -> permilps(concat())

Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org>
Tue Oct 25 07:40:14 PDT 2022


Author: Simon Pilgrim
Date: 2022-10-25T15:37:42+01:00
New Revision: ed1b0da5570a9aaf02b11c0aa691ab9f99ea4c34

URL: https://github.com/llvm/llvm-project/commit/ed1b0da5570a9aaf02b11c0aa691ab9f99ea4c34
DIFF: https://github.com/llvm/llvm-project/commit/ed1b0da5570a9aaf02b11c0aa691ab9f99ea4c34.diff

LOG: [X86] combineConcatVectorOps - fold v4i64/v8x32 concat(broadcast(),broadcast()) -> permilps(concat())

Extend the existing v4f64 fold to handle v4i64/v8f32/v8i32 as well

Fixes #58585
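To illustrate the pattern being folded, here is a reduced LLVM IR sketch modelled on the tests updated below (the function name is illustrative): each 128-bit source is splatted from lane 0 and the two splats are then concatenated into a 256-bit vector.

    define <8 x float> @concat_two_broadcasts(<4 x float> %a, <4 x float> %b) {
      ; splat lane 0 of each 128-bit source (the broadcast halves)
      %bc0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
      %bc1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
      ; concatenate the two splats into a single 256-bit vector
      %cat = shufflevector <4 x float> %bc0, <4 x float> %bc1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
      ret <8 x float> %cat
    }

As the updated checks below show, AVX2/AVX512VL targets now lower this as a single vinsertf128 followed by one in-lane vpermilps of the 256-bit vector, instead of broadcasting each 128-bit half separately before the insert.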

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c908ec6fb48f..de1c8f03b861 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54460,11 +54460,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     unsigned NumOps = Ops.size();
     switch (Op0.getOpcode()) {
     case X86ISD::VBROADCAST: {
-      if (!IsSplat && VT == MVT::v4f64 && llvm::all_of(Ops, [](SDValue Op) {
+      if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
             return Op.getOperand(0).getValueType().is128BitVector();
-          }))
-        return DAG.getNode(X86ISD::MOVDDUP, DL, VT,
-                           ConcatSubOperand(VT, Ops, 0));
+          })) {
+        if (VT == MVT::v4f64 || VT == MVT::v4i64)
+          return DAG.getNode(X86ISD::UNPCKL, DL, VT,
+                             ConcatSubOperand(VT, Ops, 0),
+                             ConcatSubOperand(VT, Ops, 0));
+        // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
+        if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
+          return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
+                                              : X86ISD::PSHUFD,
+                             DL, VT, ConcatSubOperand(VT, Ops, 0),
+                             getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
+      }
       break;
     }
     case X86ISD::MOVDDUP:

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 5f5b799a4c6c..d8f49797ba0f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1550,16 +1550,16 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_0044_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
 ; AVX512VL-NEXT:    retq
   %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
   %2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> <i32 0, i32 0>

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 6f843ab0e847..8c196a377da6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -639,19 +639,12 @@ define <8 x float> @shuffle_v8f32_00224466_v4f32(<4 x float> %a, <4 x float> %b)
 }
 
 define <8 x float> @shuffle_v8f32_00004444_v4f32(<4 x float> %a, <4 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_00004444_v4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8f32_00004444_v4f32:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %xmm0
-; AVX2OR512VL-NEXT:    vbroadcastss %xmm1, %xmm1
-; AVX2OR512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_00004444_v4f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; ALL-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x float> %1
 }
@@ -3289,19 +3282,12 @@ define <8 x i32> @shuffle_v8i32_32107654_v4i32(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <8 x i32> @shuffle_v8i32_00004444_v4f32(<4 x i32> %a, <4 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_00004444_v4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_00004444_v4f32:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %xmm0
-; AVX2OR512VL-NEXT:    vbroadcastss %xmm1, %xmm1
-; AVX2OR512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8i32_00004444_v4f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; ALL-NEXT:    retq
   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i32> %1
 }

More information about the llvm-commits mailing list