[llvm] 11e92bd - [SelectionDAG] Improve codegen for udiv by constant if any divisors are 1.

Thu Jan 5 08:41:57 PST 2023

Author: Craig Topper
Date: 2023-01-05T08:41:44-08:00
New Revision: 11e92bd61fd640bb1c833b0e185979d80204eb9e

URL: https://github.com/llvm/llvm-project/commit/11e92bd61fd640bb1c833b0e185979d80204eb9e
DIFF: https://github.com/llvm/llvm-project/commit/11e92bd61fd640bb1c833b0e185979d80204eb9e.diff

LOG: [SelectionDAG] Improve codegen for udiv by constant if any divisors are 1.

If the divisor is 1, the magic algorithm does not return a correct
result and we end up using a select to pick the numerator for those
elements at the end.

Therefore we can use undef for that element of the earlier operations
when the divisor is 1. We sometimes get this through SimplifyDemandedVectorElts,
but not always. Definitely seems like we don't if the NPQ fixup is used.

Unfortunately, DAGCombiner is unable to fold srl X, <0, undef> to X so
I had to add flags to avoid emitting the srl unless one of the shift
amounts is non-zero.

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D141022

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
    llvm/test/CodeGen/X86/combine-udiv.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b7b5d79e08210..a8bcfb1932f8d 100644

--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6033,7 +6033,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
                  cast<ConstantSDNode>(N1)->getAPIntValue().countLeadingZeros());
   }
 
-  bool UseNPQ = false;
+  bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
 
   auto BuildUDIVPattern = [&](ConstantSDNode *C) {
@@ -6041,18 +6041,18 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
       return false;
     const APInt& Divisor = C->getAPIntValue();
 
-    bool SelNPQ = false;
-    APInt Magic(Divisor.getBitWidth(), 0);
-    unsigned PreShift = 0, PostShift = 0;
+    SDValue PreShift, MagicFactor, NPQFactor, PostShift;
 
     // Magic algorithm doesn't work for division by 1. We need to emit a select
     // at the end.
-    // TODO: Use undef values for divisor of 1.
-    if (!Divisor.isOne()) {
+    if (Divisor.isOne()) {
+      PreShift = PostShift = DAG.getUNDEF(ShSVT);
+      MagicFactor = NPQFactor = DAG.getUNDEF(SVT);
+    } else {
       UnsignedDivisionByConstantInfo magics =
           UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
 
-      Magic = std::move(magics.Magic);
+      MagicFactor = DAG.getConstant(magics.Magic, dl, SVT);
 
       assert(magics.PreShift < Divisor.getBitWidth() &&
              "We shouldn't generate an undefined shift!");
@@ -6060,19 +6060,21 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
              "We shouldn't generate an undefined shift!");
       assert((!magics.IsAdd || magics.PreShift == 0) &&
              "Unexpected pre-shift");
-      PreShift = magics.PreShift;
-      PostShift = magics.PostShift;
-      SelNPQ = magics.IsAdd;
-    }
-
-    PreShifts.push_back(DAG.getConstant(PreShift, dl, ShSVT));
-    MagicFactors.push_back(DAG.getConstant(Magic, dl, SVT));
-    NPQFactors.push_back(
-        DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
-                               : APInt::getZero(EltBits),
-                        dl, SVT));
-    PostShifts.push_back(DAG.getConstant(PostShift, dl, ShSVT));
-    UseNPQ |= SelNPQ;
+      PreShift = DAG.getConstant(magics.PreShift, dl, ShSVT);
+      PostShift = DAG.getConstant(magics.PostShift, dl, ShSVT);
+      NPQFactor = DAG.getConstant(
+          magics.IsAdd ? APInt::getOneBitSet(EltBits, EltBits - 1)
+                       : APInt::getZero(EltBits),
+          dl, SVT);
+      UseNPQ |= magics.IsAdd;
+      UsePreShift |= magics.PreShift != 0;
+      UsePostShift |= magics.PostShift != 0;
+    }
+
+    PreShifts.push_back(PreShift);
+    MagicFactors.push_back(MagicFactor);
+    NPQFactors.push_back(NPQFactor);
+    PostShifts.push_back(PostShift);
     return true;
   };
 
@@ -6102,8 +6104,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
   }
 
   SDValue Q = N0;
-  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
-  Created.push_back(Q.getNode());
+  if (UsePreShift) {
+    Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
+    Created.push_back(Q.getNode());
+  }
 
   // FIXME: We should support doing a MUL in a wider type.
   auto GetMULHU = [&](SDValue X, SDValue Y) {
@@ -6152,8 +6156,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     Created.push_back(Q.getNode());
   }
 
-  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
-  Created.push_back(Q.getNode());
+  if (UsePostShift) {
+    Q = DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
+    Created.push_back(Q.getNode());
+  }
 
   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
 

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index 07f06dc1edca3..35b02f4d8af9a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -171,20 +171,17 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
 define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SDAG-LABEL: combine_vec_udiv_nonuniform4:
 ; SDAG:       // %bb.0:
+; SDAG-NEXT:    movi v1.16b, #171
 ; SDAG-NEXT:    adrp x8, .LCPI4_0
-; SDAG-NEXT:    adrp x9, .LCPI4_2
-; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
-; SDAG-NEXT:    adrp x8, .LCPI4_1
-; SDAG-NEXT:    ldr q3, [x9, :lo12:.LCPI4_2]
+; SDAG-NEXT:    adrp x9, .LCPI4_1
 ; SDAG-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; SDAG-NEXT:    ldr q3, [x9, :lo12:.LCPI4_1]
 ; SDAG-NEXT:    umull v1.8h, v0.8b, v1.8b
+; SDAG-NEXT:    and v0.16b, v0.16b, v3.16b
 ; SDAG-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
-; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI4_1]
-; SDAG-NEXT:    adrp x8, .LCPI4_3
-; SDAG-NEXT:    ushl v1.16b, v1.16b, v2.16b
-; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI4_3]
-; SDAG-NEXT:    and v1.16b, v1.16b, v3.16b
-; SDAG-NEXT:    and v0.16b, v0.16b, v2.16b
+; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
+; SDAG-NEXT:    ushr v1.16b, v1.16b, #7
+; SDAG-NEXT:    and v1.16b, v1.16b, v2.16b
 ; SDAG-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; SDAG-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 0de0fbdf5da08..f4b13ee495ec0 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -688,7 +688,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SSE2-LABEL: pr38477:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <u,4957,57457,4103,16385,35545,2048,2115>
 ; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    psubw %xmm1, %xmm2
@@ -707,7 +707,7 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ;
 ; SSE41-LABEL: pr38477:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <u,4957,57457,4103,16385,35545,2048,2115>
 ; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psubw %xmm1, %xmm2