[llvm] 9a368d2 - [X86][SSE] shuffle(hop,hop) - canonicalize unary hop(x,x) shuffle masks
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 10 08:09:39 PDT 2020
Author: Simon Pilgrim
Date: 2020-08-10T16:09:27+01:00
New Revision: 9a368d2b0088a8b7209c4a435427dfe8ad62744f
URL: https://github.com/llvm/llvm-project/commit/9a368d2b0088a8b7209c4a435427dfe8ad62744f
DIFF: https://github.com/llvm/llvm-project/commit/9a368d2b0088a8b7209c4a435427dfe8ad62744f.diff
LOG: [X86][SSE] shuffle(hop,hop) - canonicalize unary hop(x,x) shuffle masks
If a shuffle is referring to both the lower and upper half lanes of a unary horizontal op, then canonicalize the mask to only refer to the lower half.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/haddsub-undef.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0e665404ffbe..8ea98649d352 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35880,6 +35880,25 @@ static SDValue combineShuffleWithHorizOp(SDValue N, MVT VT, const SDLoc &DL,
if (!isHoriz && !isPack)
return SDValue();
+ // Canonicalize unary horizontal ops to only refer to lower halves.
+ if (TargetMask.size() == VT0.getVectorNumElements()) {
+ int NumElts = VT0.getVectorNumElements();
+ int NumLanes = VT0.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumHalfEltsPerLane = NumEltsPerLane / 2;
+ for (int i = 0; i != NumElts; ++i) {
+ int &M = TargetMask[i];
+ if (isUndefOrZero(M))
+ continue;
+ if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
+ (M % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
+ ((M - NumElts) % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ }
+ }
+
SmallVector<int, 16> TargetMask128, WideMask128;
if (isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128) &&
scaleShuffleElements(TargetMask128, 2, WideMask128)) {
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index b5f0d48dbe38..f950d0b6a723 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -1015,9 +1015,7 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
;
; SSE-FAST-LABEL: PR34724_add_v4f32_0u23:
; SSE-FAST: # %bb.0:
-; SSE-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,3]
+; SSE-FAST-NEXT: haddps %xmm1, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
@@ -1034,9 +1032,7 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
;
; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,3]
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%4 = fadd <4 x float> %3, %0
More information about the llvm-commits
mailing list