[llvm] r325604 - [X86] Correct SHRUNKBLEND creation to work correctly when there are multiple uses of the condition.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 20 09:58:17 PST 2018


Author: ctopper
Date: Tue Feb 20 09:58:17 2018
New Revision: 325604

URL: http://llvm.org/viewvc/llvm-project?rev=325604&view=rev
Log:
[X86] Correct SHRUNKBLEND creation to work correctly when there are multiple uses of the condition.

SimplifyDemandedBits forces the demanded mask to all 1s if the node has multiple uses, unless the AssumeSingleUse flag is set.

So previously we were only really likely to simplify anything if the condition had a single use. And on the off chance we did simplify with multiple uses, the demanded mask was all ones, so there was no reason to create a SHRUNKBLEND.

This patch now first checks that the condition is only used as the condition operand of selects, and then sets the AssumeSingleUse flag for the simplification. Then we convert the selects to SHRUNKBLEND, and finally replace the condition (the sketch below illustrates the kind of pattern this targets).

Differential Revision: https://reviews.llvm.org/D43446
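
To illustrate the case this targets: one vector condition feeding several
selects. The test body itself is not part of this diff, so the following is
only a sketch inferred from the test name and the checked assembly; all
names are illustrative:

    define <2 x i64> @shrunkblend_2uses(<2 x i1> %cond, <2 x i64> %a,
                                        <2 x i64> %b, <2 x i64> %c,
                                        <2 x i64> %d) {
      ; Both selects share %cond, so %cond has two uses, both as the
      ; condition operand of a select.
      %s1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b
      %s2 = select <2 x i1> %cond, <2 x i64> %c, <2 x i64> %d
      %r = add <2 x i64> %s1, %s2
      ret <2 x i64> %r
    }

Since blendvpd/vblendvpd read only the sign bit of each mask element, the
psllq $63 that moves the i1 into bit 63 is sufficient once only the sign
bit is demanded; that is why the psrad/pshufd sign-splat (SSE4.1) and the
vpcmpgtq compare (AVX) disappear in the test diff below.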

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vselect.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=325604&r1=325603&r2=325604&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Feb 20 09:58:17 2018
@@ -31910,31 +31910,29 @@ static SDValue combineSelect(SDNode *N,
     if (VT.is512BitVector())
       return SDValue();
 
-    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
-    APInt DemandedMask(APInt::getSignMask(BitWidth));
-    KnownBits Known;
-    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
-                                          !DCI.isBeforeLegalizeOps());
-    if (TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
-      // If we changed the computation somewhere in the DAG, this change will
-      // affect all users of Cond. Make sure it is fine and update all the nodes
-      // so that we do not use the generic VSELECT anymore. Otherwise, we may
-      // perform wrong optimizations as we messed with the actual expectation
-      // for the vector boolean values.
-      if (Cond != TLO.Old) {
-        // Check all uses of the condition operand to check whether it will be
-        // consumed by non-BLEND instructions. Those may require that all bits
-        // are set properly.
-        for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
-             UI != UE; ++UI) {
-          // TODO: Add other opcodes eventually lowered into BLEND.
-          if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
-            return SDValue();
-        }
+    bool CanShrinkCond = true;
+    for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+         UI != UE; ++UI) {
+      // TODO: Add other opcodes eventually lowered into BLEND.
+      if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0) {
+        CanShrinkCond = false;
+        break;
+      }
+    }
 
-        // Update all users of the condition before committing the change, so
-        // that the VSELECT optimizations that expect the correct vector boolean
-        // value will not be triggered.
+    if (CanShrinkCond) {
+      assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+      APInt DemandedMask(APInt::getSignMask(BitWidth));
+      KnownBits Known;
+      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                            !DCI.isBeforeLegalizeOps());
+      if (TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0,
+                                   /*AssumeSingleUse*/true)) {
+        // If we changed the computation somewhere in the DAG, this change will
+        // affect all users of Cond. Update all the nodes so that we do not use
+        // the generic VSELECT anymore. Otherwise, we may perform wrong
+        // optimizations as we messed with the actual expectation for the vector
+        // boolean values.
         for (SDNode *U : Cond->uses()) {
           SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
                                    U->getValueType(0), Cond, U->getOperand(1),
@@ -31942,14 +31940,8 @@ static SDValue combineSelect(SDNode *N,
           DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
         }
         DCI.CommitTargetLoweringOpt(TLO);
-        return SDValue();
+        return SDValue(N, 0);
       }
-      // Only Cond (rather than other nodes in the computation chain) was
-      // changed. Change the condition just for N to keep the opportunity to
-      // optimize all other users their own way.
-      SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
-      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
-      return SDValue();
     }
   }
 

Modified: llvm/trunk/test/CodeGen/X86/vselect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vselect.ll?rev=325604&r1=325603&r2=325604&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vselect.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vselect.ll Tue Feb 20 09:58:17 2018
@@ -503,8 +503,6 @@ define <2 x i64> @shrunkblend_2uses(<2 x
 ; SSE41-LABEL: shrunkblend_2uses:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    psllq $63, %xmm0
-; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
 ; SSE41-NEXT:    paddq %xmm2, %xmm4
@@ -514,8 +512,6 @@ define <2 x i64> @shrunkblend_2uses(<2 x
 ; AVX-LABEL: shrunkblend_2uses:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm5, %xmm0
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm4, %xmm0
 ; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
