[llvm] e9fd882 - [DAGCombiner] Add decomposition patterns for Mul-by-Imm.

Fri Oct 9 01:52:17 PDT 2020

Author: Esme-Yi
Date: 2020-10-09T08:51:40Z
New Revision: e9fd8823baf508ff4e96cff9c24bc301ef4d100e

URL: https://github.com/llvm/llvm-project/commit/e9fd8823baf508ff4e96cff9c24bc301ef4d100e
DIFF: https://github.com/llvm/llvm-project/commit/e9fd8823baf508ff4e96cff9c24bc301ef4d100e.diff

LOG: [DAGCombiner] Add decomposition patterns for Mul-by-Imm.

Summary: This patch is derived from D87384.
In this patch we expand the existing decomposition of mul-by-constant to be more general by implementing 2 patterns:
```
  mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
  mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
```
The conversion will be trigged if the multiplier is a big constant that the target can't use a single multiplication instruction to handle. This is controlled by the hook `decomposeMulByConstant`.
More over, the conversion benefits from an ILP improvement since the instructions are independent. A case with the sequence like following also gets benefit since a shift instruction is saved.

```
*res1 = a * 0x8800;
*res2 = a * 0x8080;
```

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D88201

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.h
    llvm/test/CodeGen/PowerPC/mulli.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c36a085599bf..65a5c7911503 100644

--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3622,19 +3622,30 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
                                       getShiftAmountTy(N0.getValueType()))));
   }
 
-  // Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub.
+  // Try to transform:
+  // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
   // mul x, (2^N + 1) --> add (shl x, N), x
   // mul x, (2^N - 1) --> sub (shl x, N), x
   // Examples: x * 33 --> (x << 5) + x
   //           x * 15 --> (x << 4) - x
   //           x * -33 --> -((x << 5) + x)
   //           x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
+  // (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub.
+  // mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
+  // mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
+  // Examples: x * 0x8800 --> (x << 15) + (x << 11)
+  //           x * 0xf800 --> (x << 16) - (x << 11)
+  //           x * -0x8800 --> -((x << 15) + (x << 11))
+  //           x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
   if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
     // TODO: We could handle more general decomposition of any constant by
     //       having the target set a limit on number of ops and making a
     //       callback to determine that sequence (similar to sqrt expansion).
     unsigned MathOp = ISD::DELETED_NODE;
     APInt MulC = ConstValue1.abs();
+    // The constant `2` should be treated as (2^0 + 1).
+    unsigned TZeros = MulC == 2 ? 0 : MulC.countTrailingZeros();
+    MulC.lshrInPlace(TZeros);
     if ((MulC - 1).isPowerOf2())
       MathOp = ISD::ADD;
     else if ((MulC + 1).isPowerOf2())
@@ -3643,12 +3654,17 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
     if (MathOp != ISD::DELETED_NODE) {
       unsigned ShAmt =
           MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
+      ShAmt += TZeros;
       assert(ShAmt < VT.getScalarSizeInBits() &&
              "multiply-by-constant generated out of bounds shift");
       SDLoc DL(N);
       SDValue Shl =
           DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
-      SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0);
+      SDValue R =
+          TZeros ? DAG.getNode(MathOp, DL, VT, Shl,
+                               DAG.getNode(ISD::SHL, DL, VT, N0,
+                                           DAG.getConstant(TZeros, DL, VT)))
+                 : DAG.getNode(MathOp, DL, VT, Shl, N0);
       if (ConstValue1.isNegative())
         R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
       return R;

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 6a0581788448..57d90ef446f3 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -16057,6 +16057,32 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   return true;
 }
 
+bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                               SDValue C) const {
+  // Check integral scalar types.
+  if (!VT.isScalarInteger())
+    return false;
+  if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
+    if (!ConstNode->getAPIntValue().isSignedIntN(64))
+      return false;
+    // This transformation will generate >= 2 operations. But the following
+    // cases will generate <= 2 instructions during ISEL. So exclude them.
+    // 1. If the constant multiplier fits 16 bits, it can be handled by one
+    // HW instruction, ie. MULLI
+    // 2. If the multiplier after shifted fits 16 bits, an extra shift
+    // instruction is needed than case 1, ie. MULLI and RLDICR
+    int64_t Imm = ConstNode->getSExtValue();
+    unsigned Shift = countTrailingZeros<uint64_t>(Imm);
+    Imm >>= Shift;
+    if (isInt<16>(Imm))
+      return false;
+    if (isPowerOf2_64(Imm + 1) || isPowerOf2_64(Imm - 1) ||
+        isPowerOf2_64(1 - Imm) || isPowerOf2_64(-1 - Imm))
+      return true;
+  }
+  return false;
+}
+
 bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                    EVT VT) const {
   return isFMAFasterThanFMulAndFAdd(

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index b0b5b98a4165..62a412a6c619 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -931,6 +931,9 @@ namespace llvm {
       return true;
     }
 
+    bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                SDValue C) const override;
+
     bool isDesirableToTransformToIntegerOp(unsigned Opc,
                                            EVT VT) const override {
       // Only handle float load/store pair because float(fpr) load/store

diff  --git a/llvm/test/CodeGen/PowerPC/mulli.ll b/llvm/test/CodeGen/PowerPC/mulli.ll
index 92fcac0741aa..19c2d9193c38 100644
--- a/llvm/test/CodeGen/PowerPC/mulli.ll
+++ b/llvm/test/CodeGen/PowerPC/mulli.ll
@@ -48,10 +48,9 @@ define i64 @test4(i64 %x) {
 define i64 @test5(i64 %x) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lis 4, 16
-; CHECK-NEXT:    ori 4, 4, 1
-; CHECK-NEXT:    sldi 4, 4, 12
-; CHECK-NEXT:    mulld 3, 3, 4
+; CHECK-NEXT:    sldi 4, 3, 12
+; CHECK-NEXT:    sldi 3, 3, 32
+; CHECK-NEXT:    add 3, 3, 4
 ; CHECK-NEXT:    blr
   %y = mul i64 %x, 4294971392
   ret i64 %y
@@ -60,10 +59,10 @@ define i64 @test5(i64 %x) {
 define i64 @test6(i64 %x) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lis 4, -17
-; CHECK-NEXT:    ori 4, 4, 65535
-; CHECK-NEXT:    sldi 4, 4, 12
-; CHECK-NEXT:    mulld 3, 3, 4
+; CHECK-NEXT:    sldi 4, 3, 12
+; CHECK-NEXT:    sldi 3, 3, 32
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    neg 3, 3
 ; CHECK-NEXT:    blr
   %y = mul i64 %x, -4294971392
   ret i64 %y
@@ -72,10 +71,9 @@ define i64 @test6(i64 %x) {
 define i64 @test7(i64 %x) {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lis 4, 31
-; CHECK-NEXT:    ori 4, 4, 65535
-; CHECK-NEXT:    sldi 4, 4, 13
-; CHECK-NEXT:    mulld 3, 3, 4
+; CHECK-NEXT:    sldi 4, 3, 34
+; CHECK-NEXT:    sldi 3, 3, 13
+; CHECK-NEXT:    sub 3, 4, 3
 ; CHECK-NEXT:    blr
   %y = mul i64 %x, 17179860992
   ret i64 %y
@@ -84,10 +82,9 @@ define i64 @test7(i64 %x) {
 define i64 @test8(i64 %x) {
 ; CHECK-LABEL: test8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li 4, -4
-; CHECK-NEXT:    sldi 4, 4, 32
-; CHECK-NEXT:    ori 4, 4, 8192
-; CHECK-NEXT:    mulld 3, 3, 4
+; CHECK-NEXT:    sldi 4, 3, 13
+; CHECK-NEXT:    sldi 3, 3, 34
+; CHECK-NEXT:    sub 3, 4, 3
 ; CHECK-NEXT:    blr
   %y = mul i64 %x, -17179860992
   ret i64 %y
@@ -96,12 +93,11 @@ define i64 @test8(i64 %x) {
 define i64 @test9(i64 %x) {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lis 4, 16
+; CHECK-NEXT:    sldi 4, 3, 12
+; CHECK-NEXT:    sldi 5, 3, 32
+; CHECK-NEXT:    add 4, 5, 4
 ; CHECK-NEXT:    li 5, 8193
-; CHECK-NEXT:    ori 4, 4, 1
 ; CHECK-NEXT:    sldi 5, 5, 19
-; CHECK-NEXT:    sldi 4, 4, 12
-; CHECK-NEXT:    mulld 4, 3, 4
 ; CHECK-NEXT:    mulld 3, 3, 5
 ; CHECK-NEXT:    sub 3, 4, 3
 ; CHECK-NEXT:    blr
@@ -114,13 +110,8 @@ define i64 @test9(i64 %x) {
 define i64 @test10(i64 %x) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lis 4, 31
-; CHECK-NEXT:    lis 5, 16383
-; CHECK-NEXT:    ori 4, 4, 65535
-; CHECK-NEXT:    ori 5, 5, 57344
-; CHECK-NEXT:    sldi 4, 4, 13
-; CHECK-NEXT:    mulld 4, 3, 4
-; CHECK-NEXT:    mulld 3, 3, 5
+; CHECK-NEXT:    sldi 4, 3, 34
+; CHECK-NEXT:    sldi 3, 3, 30
 ; CHECK-NEXT:    sub 3, 4, 3
 ; CHECK-NEXT:    blr
   %y = mul i64 %x, 17179860992