[llvm] 6f7de81 - [RISCV] Use MULHU for more division by constant cases.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 9 09:11:03 PST 2021


Author: Craig Topper
Date: 2021-12-09T09:10:14-08:00
New Revision: 6f7de819b9dc970f41fde72b62ca4d1c55320641

URL: https://github.com/llvm/llvm-project/commit/6f7de819b9dc970f41fde72b62ca4d1c55320641
DIFF: https://github.com/llvm/llvm-project/commit/6f7de819b9dc970f41fde72b62ca4d1c55320641.diff

LOG: [RISCV] Use MULHU for more division by constant cases.

D113805 improved handling of i32 divu/remu on RV64. That basic idea
can be extended to (mul (and X, C2), C1) where C2 is any mask
constant.

We can replace the AND with an SLLI that shifts X left by the number
of leading zeros in C2, if we also shift C1 left by XLen - lzcnt(C2)
bits. This gives the full product XLen additional trailing zeros,
putting the desired result in the upper XLen bits computed by MULHU.
If we can't use ANDI, ZEXT.H, or ZEXT.W, this avoids materializing C2
in a register.
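
As a concrete check of the arithmetic, here is a standalone C++ sketch
(not part of the patch; mulhu32 stands in for the MULHU instruction)
covering the udiv16_constant case from div.ll below, where C2 = 0xFFFF
and C1 = 0xCCCD with XLen = 32:

  #include <cassert>
  #include <cstdint>

  // Upper 32 bits of the full 64-bit product, i.e. what MULHU computes
  // when XLen = 32.
  static uint32_t mulhu32(uint32_t a, uint32_t b) {
    return (uint32_t)(((uint64_t)a * b) >> 32);
  }

  int main() {
    for (uint32_t x = 0; x <= 0xFFFF; ++x) {
      // Before: (and x, 0xFFFF) * 0xCCCD, then srli by 18.
      uint32_t before = ((x & 0xFFFF) * 0xCCCDu) >> 18;
      // After: slli x by lzcnt(0xFFFF) = 16, mulhu by 0xCCCD << 16,
      // then the same srli by 18.
      uint32_t after = mulhu32(x << 16, 0xCCCDu << 16) >> 18;
      assert(before == after && before == x / 5);
    }
    return 0;
  }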

The downside is that it may take one additional instruction to
materialize the shifted C1 (for example, lui+addiw+slli instead of
lui+addiw in urem-lkk.ll below). But since that instruction is not on
the critical path, it can hopefully be interleaved with other
operations.

The previous tablegen pattern is replaced by custom isel code.
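
For reference, the selection logic reduces to the following shift
arithmetic (a minimal standalone C++ sketch mirroring the
RISCVISelDAGToDAG.cpp change below; planMulhu is a made-up helper, not
an LLVM API, and the RV32 sign extension of the constant is omitted):

  #include <bit>       // std::countl_zero (C++20)
  #include <cstdint>
  #include <optional>

  struct Plan { unsigned SrcShift; uint64_t ShiftedC1; };

  // For (mul (and X, C2), C1) with C2 a mask narrower than XLen bits,
  // compute the SLLI amount for X and the pre-shifted constant fed to
  // MULHU, or nullopt if shifting C1 would drop set bits.
  std::optional<Plan> planMulhu(uint64_t C1, uint64_t C2, unsigned XLen) {
    unsigned LeadingZeros = XLen - (64 - std::countl_zero(C2));
    unsigned ConstantShift = XLen - LeadingZeros;
    if (ConstantShift > XLen - (64 - std::countl_zero(C1)))
      return std::nullopt;
    return Plan{LeadingZeros, C1 << ConstantShift};
  }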

Reviewed By: asb

Differential Revision: https://reviews.llvm.org/D115310

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
    llvm/lib/Target/RISCV/RISCVInstrInfoM.td
    llvm/test/CodeGen/RISCV/div.ll
    llvm/test/CodeGen/RISCV/pr51206.ll
    llvm/test/CodeGen/RISCV/urem-lkk.ll
    llvm/test/CodeGen/RISCV/xaluo.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 66a34d73dd370..516e765c9202c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -718,6 +718,71 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
 
     break;
   }
+  case ISD::MUL: {
+    // Special case for calculating (mul (and X, C2), C1) where the full product
+    // fits in XLen bits. We can shift X left by the number of leading zeros in
+    // C2 and shift C1 left by XLen-lzcnt(C2). This will ensure the final
+    // product has XLen trailing zeros, putting it in the output of MULHU. This
+    // can avoid materializing a constant in a register for C2.
+
+    // RHS should be a constant.
+    auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+    if (!N1C || !N1C->hasOneUse())
+      break;
+
+    // LHS should be an AND with constant.
+    SDValue N0 = Node->getOperand(0);
+    if (N0.getOpcode() != ISD::AND || !isa<ConstantSDNode>(N0.getOperand(1)))
+      break;
+
+    uint64_t C2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+
+    // Constant should be a mask.
+    if (!isMask_64(C2))
+      break;
+
+    // This should be the only use of the AND unless we will use
+    // (SRLI (SLLI X, 32), 32). We don't use a shift pair for other AND
+    // constants.
+    if (!N0.hasOneUse() && C2 != UINT64_C(0xFFFFFFFF))
+      break;
+
+    // If this can be an ANDI, ZEXT.H or ZEXT.W we don't need to do this
+    // optimization.
+    if (isInt<12>(C2) ||
+        (C2 == UINT64_C(0xFFFF) &&
+         (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) ||
+        (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba()))
+      break;
+
+    // We need to shift left the AND input and C1 by a total of XLen bits.
+
+    // How far left do we need to shift the AND input?
+    unsigned XLen = Subtarget->getXLen();
+    unsigned LeadingZeros = XLen - (64 - countLeadingZeros(C2));
+
+    // The constant gets shifted by the remaining amount unless that would
+    // shift bits out.
+    uint64_t C1 = N1C->getZExtValue();
+    unsigned ConstantShift = XLen - LeadingZeros;
+    if (ConstantShift > (XLen - (64 - countLeadingZeros(C1))))
+      break;
+
+    uint64_t ShiftedC1 = C1 << ConstantShift;
+    // If this is RV32, we need to sign extend the constant.
+    if (XLen == 32)
+      ShiftedC1 = SignExtend64(ShiftedC1, 32);
+
+    // Create (mulhu (slli X, lzcnt(C2)), C1 << (XLen - lzcnt(C2))).
+    SDNode *Imm = selectImm(CurDAG, DL, ShiftedC1, *Subtarget);
+    SDNode *SLLI =
+        CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
+                               CurDAG->getTargetConstant(LeadingZeros, DL, VT));
+    SDNode *MULHU = CurDAG->getMachineNode(RISCV::MULHU, DL, VT,
+                                           SDValue(SLLI, 0), SDValue(Imm, 0));
+    ReplaceNode(Node, MULHU);
+    return;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntNo = Node->getConstantOperandVal(0);
     switch (IntNo) {

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index a037dbf585ce4..b62e23d3b0fa6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -96,14 +96,6 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
           (REMW GPR:$rs1, GPR:$rs2)>;
 } // Predicates = [HasStdExtM, IsRV64]
 
-// Pattern to detect constants with no more than 32 active bits that can't
-// be materialized with lui+addiw.
-def uimm32_not_simm32 : PatLeaf<(XLenVT GPR:$a), [{
-  auto *C = dyn_cast<ConstantSDNode>(N);
-  return C && C->hasOneUse() && isUInt<32>(C->getZExtValue()) &&
-         !isInt<32>(C->getSExtValue());
-}]>;
-
 let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
 // Special case for calculating the full 64-bit product of a 32x32 unsigned
 // multiply where the inputs aren't known to be zero extended. We can shift the
@@ -111,9 +103,4 @@ let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
 // zeroing the upper 32 bits.
 def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
           (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
-// The RHS could also be a constant that is hard to materialize. By shifting
-// left we can allow constant materialization to use LUI+ADDIW via
-// hasAllWUsers.
-def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), uimm32_not_simm32:$rs2)),
-          (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
 } // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba]

diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index 4a8a1bc5ac1e0..5d6b793b38a7c 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -498,12 +498,9 @@ define i16 @udiv16_constant(i16 %a) nounwind {
 ;
 ; RV32IM-LABEL: udiv16_constant:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lui a1, 16
-; RV32IM-NEXT:    addi a1, a1, -1
-; RV32IM-NEXT:    and a0, a0, a1
-; RV32IM-NEXT:    lui a1, 13
-; RV32IM-NEXT:    addi a1, a1, -819
-; RV32IM-NEXT:    mul a0, a0, a1
+; RV32IM-NEXT:    slli a0, a0, 16
+; RV32IM-NEXT:    lui a1, 838864
+; RV32IM-NEXT:    mulhu a0, a0, a1
 ; RV32IM-NEXT:    srli a0, a0, 18
 ; RV32IM-NEXT:    ret
 ;
@@ -522,12 +519,10 @@ define i16 @udiv16_constant(i16 %a) nounwind {
 ;
 ; RV64IM-LABEL: udiv16_constant:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a1, 16
-; RV64IM-NEXT:    addiw a1, a1, -1
-; RV64IM-NEXT:    and a0, a0, a1
-; RV64IM-NEXT:    lui a1, 13
-; RV64IM-NEXT:    addiw a1, a1, -819
-; RV64IM-NEXT:    mul a0, a0, a1
+; RV64IM-NEXT:    lui a1, 52429
+; RV64IM-NEXT:    slli a1, a1, 4
+; RV64IM-NEXT:    slli a0, a0, 48
+; RV64IM-NEXT:    mulhu a0, a0, a1
 ; RV64IM-NEXT:    srli a0, a0, 18
 ; RV64IM-NEXT:    ret
   %1 = udiv i16 %a, 5

diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index 019853b5e8b06..5beeeee307d6f 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -21,12 +21,10 @@ define signext i32 @wobble() nounwind {
 ; CHECK-NEXT:    lui a2, %hi(global.1)
 ; CHECK-NEXT:    sw a0, %lo(global.1)(a2)
 ; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    lui a1, 16
-; CHECK-NEXT:    addiw a1, a1, -1
-; CHECK-NEXT:    and a1, a0, a1
-; CHECK-NEXT:    lui a2, 13
-; CHECK-NEXT:    addiw a2, a2, -819
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a0, 48
+; CHECK-NEXT:    lui a2, 52429
+; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    mulhu a1, a1, a2
 ; CHECK-NEXT:    srli a1, a1, 18
 ; CHECK-NEXT:    lui a2, %hi(global.3)
 ; CHECK-NEXT:    li a3, 5

diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index 16eb225d1b111..5ea52028868ea 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -48,10 +48,10 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
 ; RV64IM-LABEL: fold_urem_positive_odd:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 32
-; RV64IM-NEXT:    srli a1, a1, 32
 ; RV64IM-NEXT:    lui a2, 364242
 ; RV64IM-NEXT:    addiw a2, a2, 777
-; RV64IM-NEXT:    mul a1, a1, a2
+; RV64IM-NEXT:    slli a2, a2, 32
+; RV64IM-NEXT:    mulhu a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 32
 ; RV64IM-NEXT:    subw a2, a0, a1
 ; RV64IM-NEXT:    srliw a2, a2, 1
@@ -179,10 +179,10 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
 ; RV64IM-LABEL: combine_urem_udiv:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 32
-; RV64IM-NEXT:    srli a1, a1, 32
 ; RV64IM-NEXT:    lui a2, 364242
 ; RV64IM-NEXT:    addiw a2, a2, 777
-; RV64IM-NEXT:    mul a1, a1, a2
+; RV64IM-NEXT:    slli a2, a2, 32
+; RV64IM-NEXT:    mulhu a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 32
 ; RV64IM-NEXT:    subw a2, a0, a1
 ; RV64IM-NEXT:    srliw a2, a2, 1

diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 880129e012b66..b2005a329e065 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -1201,10 +1201,10 @@ define zeroext i1 @umulo2.i32(i32 %v1, i32* %res) {
 ;
 ; RV64-LABEL: umulo2.i32:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a0, a0, 32
 ; RV64-NEXT:    li a2, 13
-; RV64-NEXT:    mul a2, a0, a2
+; RV64-NEXT:    slli a2, a2, 32
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    mulhu a2, a0, a2
 ; RV64-NEXT:    srli a0, a2, 32
 ; RV64-NEXT:    snez a0, a0
 ; RV64-NEXT:    sw a2, 0(a1)

