[llvm] a1c9529 - [X86][AVX] isHorizontalBinOp - relax no-lane-crossing limit for AVX1-only targets.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 29 12:49:25 PDT 2020
Author: Simon Pilgrim
Date: 2020-07-29T20:49:10+01:00
New Revision: a1c9529e602adcc25162f93fd4563fbe3d04ab62
URL: https://github.com/llvm/llvm-project/commit/a1c9529e602adcc25162f93fd4563fbe3d04ab62
DIFF: https://github.com/llvm/llvm-project/commit/a1c9529e602adcc25162f93fd4563fbe3d04ab62.diff
LOG: [X86][AVX] isHorizontalBinOp - relax no-lane-crossing limit for AVX1-only targets.
Instead of rejecting v8f32/v4f64 FHADD/FHSUB outright whenever the input shuffle masks cross 128-bit lanes, perform the matching anyway and check whether the post-shuffle mask simplifies to a 'whole lane shuffle' mask, in which case we are guaranteed to be able to perform it cheaply as a VPERM2F128 shuffle.
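The key test is whether the per-element post-shuffle mask can be widened to a 2-element mask at 128-bit lane granularity (the canScaleShuffleElements(PostShuffleMask, 2) call added below). The following is a minimal standalone sketch of that widening check, not the actual scaleShuffleElements implementation in X86ISelLowering.cpp; the helper name canWidenToWholeGroups, the std::vector interface and the printf driver are illustrative assumptions.

// Standalone sketch of the idea behind canScaleShuffleElements(Mask, 2):
// can a per-element shuffle mask be widened so that each destination
// element covers one whole group of consecutive source elements (here,
// a whole 128-bit lane)? -1 means "undef" and matches anything.
#include <cstdio>
#include <vector>

static bool canWidenToWholeGroups(const std::vector<int> &Mask,
                                  unsigned NumDstElts) {
  if (Mask.empty() || Mask.size() % NumDstElts != 0)
    return false;
  unsigned Scale = Mask.size() / NumDstElts;
  for (unsigned Dst = 0; Dst != NumDstElts; ++Dst) {
    int GroupBase = -1; // source group chosen for this destination group
    for (unsigned I = 0; I != Scale; ++I) {
      int M = Mask[Dst * Scale + I];
      if (M < 0)
        continue; // undef element matches anything
      if (M % Scale != I)
        return false; // element sits at the wrong offset within its group
      int Base = M - int(I);
      if (GroupBase < 0)
        GroupBase = Base;
      else if (GroupBase != Base)
        return false; // group mixes elements from different source groups
    }
  }
  return true;
}

int main() {
  // v4f64 post-shuffle mask that swaps the two 128-bit lanes: <2,3,0,1>.
  // It widens to the 2-element lane mask <1,0>, so a single VPERM2F128 works.
  std::vector<int> LaneSwap = {2, 3, 0, 1};
  // A mask that crosses lanes element by element, e.g. <1,2,3,0>, does not.
  std::vector<int> Rotate = {1, 2, 3, 0};
  std::printf("lane swap widens: %d\n", canWidenToWholeGroups(LaneSwap, 2));
  std::printf("rotate widens:    %d\n", canWidenToWholeGroups(Rotate, 2));
  return 0;
}

In the hadd_reverse_v8f64 test updated below, the post-shuffle mask on each v4f64 half is exactly such a lane swap, which is why the AVX1 output now keeps the vhaddpd and performs the cross-lane movement as a single vperm2f128 ymm[2,3,0,1] per half instead of pre-shuffling both inputs.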
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/haddsub-4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 556bb5846881..8d95c066f5a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5625,6 +5625,11 @@ static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
return false;
}
+static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
+ SmallVector<int, 32> WidenedMask;
+ return scaleShuffleElements(Mask, NumDstElts, WidenedMask);
+}
+
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -44486,12 +44491,6 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
RMask.push_back(i);
}
- // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
- if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
- (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
- isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
- return false;
-
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
@@ -44554,6 +44553,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
if (IsIdentityPostShuffle)
PostShuffleMask.clear();
+ // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split), unless
+ // the shuffle can widen to shuffle entire lanes, which should still be quick.
+ if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+ isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
+ PostShuffleMask) &&
+ !canScaleShuffleElements(PostShuffleMask, 2))
+ return false;
+
// Assume a SingleSource HOP if we only shuffle one input and don't need to
// shuffle the result.
if (!shouldUseHorizontalOp(LHS == RHS &&
diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll
index 4c1dc71982aa..31b6617b45a9 100644
--- a/llvm/test/CodeGen/X86/haddsub-4.ll
+++ b/llvm/test/CodeGen/X86/haddsub-4.ll
@@ -151,13 +151,11 @@ define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) noun
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm2
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovapd %ymm2, %ymm1
+; AVX1-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vmovapd %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
More information about the llvm-commits mailing list