[llvm] 285b8ab - [x86] limit vector increment fold to allow load folding

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 29 12:53:24 PDT 2021


Author: Sanjay Patel
Date: 2021-10-29T15:48:35-04:00
New Revision: 285b8abce483eaa25e20d3f3bd704592c3bdec88

URL: https://github.com/llvm/llvm-project/commit/285b8abce483eaa25e20d3f3bd704592c3bdec88
DIFF: https://github.com/llvm/llvm-project/commit/285b8abce483eaa25e20d3f3bd704592c3bdec88.diff

LOG: [x86] limit vector increment fold to allow load folding

The tests are based on the example from:
https://llvm.org/PR52032

I suspect it looks worse than it actually is. :)
That is, llvm-mca reports no uop/timing difference on Haswell (and
probably other targets) between the pcmpeq form and the
broadcast-with-load-folding form.
The load folding definitely makes the code smaller, so it is good
for that at least. This requires carving a narrow hole in the
transform to catch just this case without changing others that look
good as-is (in other words, the transform still seems good for
most examples).

Differential Revision: https://reviews.llvm.org/D112464
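
For reference, here is a minimal sketch of the case being carved out,
modeled on the PR52032_2 test below (the function name and the llc
invocation are illustrative, not part of this commit). Two adds share
the same splat-of-1 constant, so with AVX it is profitable to
materialize the constant once (vpbroadcastd) and fold both loads into
vpaddd; with a single use of the constant, or without AVX, the
sub-of-all-ones (pcmpeq) form is still preferred:

; Compile with something like: llc -mtriple=x86_64-- -mattr=+avx2
; The splat-1 constant has two uses, so the new guard skips the
; add->sub transform and both vpaddd instructions fold their memory
; operands.
define void @splat1_increment_twice(<4 x i32>* %p) {
  %p1 = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i64 1
  %a = load <4 x i32>, <4 x i32>* %p, align 4
  %b = load <4 x i32>, <4 x i32>* %p1, align 4
  %a1 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>
  %b1 = add <4 x i32> %b, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %a1, <4 x i32>* %p, align 4
  store <4 x i32> %b1, <4 x i32>* %p1, align 4
  ret void
}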

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/test/CodeGen/X86/combine-sub.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index c08da6e1a820..6370190a8726 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -880,16 +880,31 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
       continue;
     }
 
-    /// Convert vector increment or decrement to sub/add with an all-ones
-    /// constant:
-    /// add X, <1, 1...> --> sub X, <-1, -1...>
-    /// sub X, <1, 1...> --> add X, <-1, -1...>
-    /// The all-ones vector constant can be materialized using a pcmpeq
-    /// instruction that is commonly recognized as an idiom (has no register
-    /// dependency), so that's better/smaller than loading a splat 1 constant.
+    // Convert vector increment or decrement to sub/add with an all-ones
+    // constant:
+    // add X, <1, 1...> --> sub X, <-1, -1...>
+    // sub X, <1, 1...> --> add X, <-1, -1...>
+    // The all-ones vector constant can be materialized using a pcmpeq
+    // instruction that is commonly recognized as an idiom (has no register
+    // dependency), so that's better/smaller than loading a splat 1 constant.
+    //
+    // But don't do this if it would inhibit a potentially profitable load
+    // folding opportunity for the other operand. That only occurs with the
+    // intersection of:
+    // (1) The other operand (op0) is load foldable.
+    // (2) The op is an add (otherwise, we are *creating* an add and can still
+    //     load fold the other op).
+    // (3) The target has AVX (otherwise, we have a destructive add and can't
+    //     load fold the other op without killing the constant op).
+    // (4) The constant 1 vector has multiple uses (so it is profitable to load
+    //     into a register anyway).
+    auto mayPreventLoadFold = [&]() {
+      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
+             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
+             !N->getOperand(1).hasOneUse();
+    };
     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
-        N->getSimpleValueType(0).isVector()) {
-
+        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
       APInt SplatVal;
       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
           SplatVal.isOne()) {

diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll
index a399c5175dd6..f38191f55aeb 100644
--- a/llvm/test/CodeGen/X86/combine-sub.ll
+++ b/llvm/test/CodeGen/X86/combine-sub.ll
@@ -276,6 +276,10 @@ define <4 x i32> @combine_vec_neg_xor_consts(<4 x i32> %x) {
   ret <4 x i32> %sub
 }
 
+; With AVX, this could use broadcast (an extra load) and
+; load-folded 'add', but currently we favor the virtually
+; free pcmpeq instruction.
+
 define void @PR52032_oneuse_constant(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032_oneuse_constant:
 ; SSE:       # %bb.0:
@@ -302,6 +306,9 @@ define void @PR52032_oneuse_constant(<8 x i32>* %p) {
   ret void
 }
 
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032:
 ; SSE:       # %bb.0:
@@ -322,12 +329,10 @@ define void @PR52032(<8 x i32>* %p) {
 ;
 ; AVX-LABEL: PR52032:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX-NEXT:    vmovdqu 32(%rdi), %ymm2
-; AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %ymm0, %ymm1
 ; AVX-NEXT:    vmovdqu %ymm1, (%rdi)
-; AVX-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpaddd 32(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqu %ymm0, 32(%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -341,6 +346,10 @@ define void @PR52032(<8 x i32>* %p) {
   ret void
 }
 
+; Same as above, but 128-bit ops:
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032_2(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_2:
 ; SSE:       # %bb.0:
@@ -355,12 +364,10 @@ define void @PR52032_2(<4 x i32>* %p) {
 ;
 ; AVX-LABEL: PR52032_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu (%rdi), %xmm1
-; AVX-NEXT:    vmovdqu 16(%rdi), %xmm2
-; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
 ; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
 ; AVX-NEXT:    retq
   %i3 = load <4 x i32>, <4 x i32>* %p, align 4
@@ -373,6 +380,8 @@ define void @PR52032_2(<4 x i32>* %p) {
   ret void
 }
 
+; If we are starting with a 'sub', it is always better to do the transform.
+
 define void @PR52032_3(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_3:
 ; SSE:       # %bb.0:
@@ -403,6 +412,8 @@ define void @PR52032_3(<4 x i32>* %p) {
   ret void
 }
 
+; If there's no chance of profitable load folding (because of extra uses), we convert 'add' to 'sub'.
+
 define void @PR52032_4(<4 x i32>* %p, <4 x i32>* %q) {
 ; SSE-LABEL: PR52032_4:
 ; SSE:       # %bb.0:


        

