[llvm] 07f1d8f - [X86] lowerShuffleAsDecomposedShuffleMerge(): if both inputs are broadcastable/identities, canonicalize broadcasts as such

Roman Lebedev via llvm-commits <llvm-commits at lists.llvm.org>
Sun Sep 19 07:36:04 PDT 2021


Author: Roman Lebedev
Date: 2021-09-19T17:35:37+03:00
New Revision: 07f1d8f0caa1516e0d97616adfea4aa94f7883a4

URL: https://github.com/llvm/llvm-project/commit/07f1d8f0caa1516e0d97616adfea4aa94f7883a4
DIFF: https://github.com/llvm/llvm-project/commit/07f1d8f0caa1516e0d97616adfea4aa94f7883a4.diff

LOG: [X86] lowerShuffleAsDecomposedShuffleMerge(): if both inputs are broadcastable/identities, canonicalize broadcasts as such

Split off from D108253.
A broadcast is simpler than any other shuffle we might produce
to achieve the same result, so prefer it.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D108382
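
To make "both inputs are broadcastable/identities" concrete, take the
two-input shuffle from the binary v4i64 case in
copy-low-subvec-elt-to-high-subvec-elt.ll below, whose mask is
<0, 4, 2, 4> over %x and %y. The standalone C++ sketch that follows
(illustrative names and a simplified decomposition, not the actual
X86ISelLowering.cpp helpers) splits such a mask into per-input masks and
classifies them:

  // Standalone sketch of the per-input mask decomposition; names and the
  // simplified splitting are illustrative, not the real LLVM helpers.
  // -1 marks an undef/unused lane, matching LLVM's shuffle mask convention.
  #include <cstdio>
  #include <vector>

  static bool isNoopMask(const std::vector<int> &Mask) {
    for (size_t I = 0; I != Mask.size(); ++I)
      if (Mask[I] >= 0 && Mask[I] != (int)I)
        return false;
    return true;
  }

  static bool isBroadcastMask(const std::vector<int> &Mask) {
    for (int M : Mask)
      if (M >= 0 && M != 0)
        return false;
    return true;
  }

  int main() {
    // Mask of the binary v4i64 test below:
    //   shufflevector <4 x i64> %x, <4 x i64> %y,
    //                 <4 x i32> <i32 0, i32 4, i32 2, i32 4>
    // Indices 0..3 select from %x, indices 4..7 select from %y.
    std::vector<int> Mask = {0, 4, 2, 4};
    std::vector<int> V1Mask(4, -1), V2Mask(4, -1);
    for (size_t I = 0; I != Mask.size(); ++I) {
      if (Mask[I] < 0)
        continue;
      if (Mask[I] < 4)
        V1Mask[I] = Mask[I];     // lane comes from %x
      else
        V2Mask[I] = Mask[I] - 4; // lane comes from %y
    }
    // V1Mask = {0,-1,2,-1}: an identity (no-op) mask.
    // V2Mask = {-1,0,-1,0}: only element 0 of %y is ever read.
    std::printf("V1 noop: %d, V2 broadcast: %d\n", isNoopMask(V1Mask),
                isBroadcastMask(V2Mask));
    return 0;
  }

Here V2 demands only element 0 of %y, so %y can be canonicalized to a
broadcast and the rest of the lowering is a single blend/unpack; later
combines may still rewrite the broadcast itself, as some of the test
updates below show.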

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
    llvm/test/CodeGen/X86/horizontal-sum.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 910b5a05ddf38..8e4f238b2f073 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12663,6 +12663,14 @@ static SDValue lowerShuffleAsByteRotateAndPermute(
   return SDValue();
 }
 
+static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
+  return isUndefOrEqual(Mask, 0);
+}
+
+static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
+  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
+}
+
 /// Generic routine to decompose a shuffle and blend into independent
 /// blends and permutes.
 ///
@@ -12696,6 +12704,38 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
     }
   }
 
+  // If we effectively demand only the 0'th element of \p Input, and not just
+  // in the 0'th lane, then broadcast said input,
+  // and change \p InputMask to be a no-op (identity) mask.
+  auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
+                                         &DAG](SDValue &Input,
+                                               MutableArrayRef<int> InputMask) {
+    unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
+    if (!Subtarget.hasAVX2() &&
+        (!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
+      return;
+    if (isNoopShuffleMask(InputMask))
+      return;
+    assert(isBroadcastShuffleMask(InputMask) &&
+           "Expected to demand only the 0'th element.");
+    Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
+    for (auto I : enumerate(InputMask)) {
+      int &InputMaskElt = I.value();
+      if (InputMaskElt >= 0)
+        InputMaskElt = I.index();
+    }
+  };
+
+  // Currently, we may need to produce one shuffle per input, and blend results.
+  // It is possible that the shuffle for one of the inputs is already a no-op.
+  // See if we can simplify non-no-op shuffles into broadcasts,
+  // which we consider to be strictly better than an arbitrary shuffle.
+  if (isNoopOrBroadcastShuffleMask(V1Mask) &&
+      isNoopOrBroadcastShuffleMask(V2Mask)) {
+    canonicalizeBroadcastableInput(V1, V1Mask);
+    canonicalizeBroadcastableInput(V2, V2Mask);
+  }
+
   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
   // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
   // the shuffle may be able to fold with a load or other benefit. However, when

diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index 39fed0690ece7..c7b1b92918b14 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -284,7 +284,7 @@ define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_unary(<4
 define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    retq
   %r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
@@ -741,7 +741,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_1_unary(<32
 define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_1_binary(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_1_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; CHECK-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -763,8 +763,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_2_unary(<32
 define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_2_binary(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_2_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; CHECK-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -800,7 +799,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_1_unary(<32
 ; CHECK-LABEL: vec256_eltty_i8_source_subvec_1_target_subvec_mask_1_unary:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; CHECK-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index d46f9ca946574..6d053e9f78345 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -143,9 +143,9 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
 ; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm2
-; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32:
@@ -458,20 +458,19 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST:       # %bb.0:
 ; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
-; AVX2-FAST-NEXT:    vphaddd %xmm4, %xmm4, %xmm4
-; AVX2-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm5
+; AVX2-FAST-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
+; AVX2-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm4[0,3]
-; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[0]
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[1,3]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
-; AVX2-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm2, %xmm1
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
+; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
@@ -752,16 +751,16 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
-; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT:    vpbroadcastq %xmm3, %xmm1
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm2
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm4
+; AVX2-SLOW-NEXT:    vpbroadcastq %xmm3, %xmm5
+; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
-; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
@@ -776,15 +775,15 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm3, %xmm4
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,2,2,2]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
+; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm4
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
 ; AVX2-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
-; AVX2-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
-; AVX2-FAST-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-FAST-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
 ; AVX2-FAST-NEXT:    retq
   %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
   %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
@@ -1089,28 +1088,28 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSSE3-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; AVX-FAST:       # %bb.0:
@@ -1126,6 +1125,28 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ; AVX-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm1
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    retq
   %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
   %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
   %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)


        

