[llvm] dbead23 - [RISCV] Add custom isel for (add X, imm) used by load/stores.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 2 13:45:40 PDT 2022


Author: Craig Topper
Date: 2022-06-02T13:45:32-07:00
New Revision: dbead2388b48c1730f54f9aef074a81254573797

URL: https://github.com/llvm/llvm-project/commit/dbead2388b48c1730f54f9aef074a81254573797
DIFF: https://github.com/llvm/llvm-project/commit/dbead2388b48c1730f54f9aef074a81254573797.diff

LOG: [RISCV] Add custom isel for (add X, imm) used by load/stores.

If the imm is out of range for an ADDI, we will materialize it in
a register using multiple instructions. If the ADD is used by a
load/store, doPeepholeLoadStoreADDI can try to pull an ADDI from
the constant materialization into the load/store offset. This only
works if the ADD has a single use; otherwise the peephole would have
to rebuild multiple nodes.
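
For illustration (numbers taken from the lw_sw_far_local test in
mem.ll below, offset 16380 = 4095*4), the old output shows the fold
being blocked because the ADD has two users:

    lui   a2, 4          # a2 = 16384
    addi  a2, a2, -4     # a2 = 16380, the materialization ADDI
    add   a2, a0, a2
    lw    a0, 0(a2)      # second use (the sw) blocked the old fold
    sw    a1, 0(a2)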

This patch instead solves the problem when the ADD is selected. We
check that the ADD is used only by loads/stores and, if so, select it
as (ADDI (ADD X, Imm-Lo12), Lo12). This will enable
the simple case in doPeepholeLoadStoreADDI that can bypass an ADDI
used as a pointer. As a result we can remove the more complicated
peephole from doPeepholeLoadStoreADDI.
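
With the same numbers: Lo12 = SignExtend64<12>(16380) = -4 and
Imm-Lo12 = 16384 = 4<<12, so the high part needs only an LUI. A sketch
of the expected selection (matching the updated mem.ll checks):

    lui   a2, 4          # Imm-Lo12 = 16384
    add   a2, a0, a2     # (ADD X, Imm-Lo12)
    lw    a0, -4(a2)     # Lo12 folded into the offset by the peephole
    sw    a1, -4(a2)

On RV64, when Imm-Lo12 is 0x80000000 it is no longer a simm32, so the
materialization uses LI+SLLI instead of LUI+ADDIW, as the updated
mem64.ll checks show.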

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D126576

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
    llvm/test/CodeGen/RISCV/mem.ll
    llvm/test/CodeGen/RISCV/mem64.ll
    llvm/test/CodeGen/RISCV/split-offsets.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index af91cd59d9c7..b6606161c491 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -153,6 +153,40 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() {
     CurDAG->RemoveDeadNodes();
 }
 
+// Returns true if N is a MachineSDNode that has a reg and simm12 memory
+// operand. The indices of the base pointer and offset are returned in BaseOpIdx
+// and OffsetOpIdx.
+static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx,
+                         unsigned &OffsetOpIdx) {
+  switch (N->getMachineOpcode()) {
+  case RISCV::LB:
+  case RISCV::LH:
+  case RISCV::LW:
+  case RISCV::LBU:
+  case RISCV::LHU:
+  case RISCV::LWU:
+  case RISCV::LD:
+  case RISCV::FLH:
+  case RISCV::FLW:
+  case RISCV::FLD:
+    BaseOpIdx = 0;
+    OffsetOpIdx = 1;
+    return true;
+  case RISCV::SB:
+  case RISCV::SH:
+  case RISCV::SW:
+  case RISCV::SD:
+  case RISCV::FSH:
+  case RISCV::FSW:
+  case RISCV::FSD:
+    BaseOpIdx = 1;
+    OffsetOpIdx = 2;
+    return true;
+  }
+
+  return false;
+}
+
 static SDNode *selectImmWithConstantPool(SelectionDAG *CurDAG, const SDLoc &DL,
                                          const MVT VT, int64_t Imm,
                                          const RISCVSubtarget &Subtarget) {
@@ -664,6 +698,70 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
     return;
   }
+  case ISD::ADD: {
+    // Try to select ADD + immediate used as memory addresses to
+    // (ADDI (ADD X, Imm-Lo12), Lo12) if it will allow the ADDI to be removed by
+    // doPeepholeLoadStoreADDI.
+
+    // The RHS should be an immediate.
+    auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+    if (!N1C)
+      break;
+
+    int64_t Offset = N1C->getSExtValue();
+    int64_t Lo12 = SignExtend64<12>(Offset);
+
+    // Don't do this if the lower 12 bits are 0 or we could use ADDI directly.
+    if (Lo12 == 0 || isInt<12>(Offset))
+      break;
+
+    // Don't do this if we can use a pair of ADDIs.
+    if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2))
+      break;
+
+    bool AllPointerUses = true;
+    for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) {
+      SDNode *User = *UI;
+
+      // Check that this user is a memory instruction with a reg+simm12
+      // addressing mode and that this ADD is its base pointer.
+      unsigned BaseOpIdx, OffsetOpIdx;
+      if (!User->isMachineOpcode() ||
+          !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) ||
+          UI.getOperandNo() != BaseOpIdx) {
+        AllPointerUses = false;
+        break;
+      }
+
+      // If the memory instruction already has an offset, make sure the combined
+      // offset is foldable.
+      int64_t MemOffs =
+          cast<ConstantSDNode>(User->getOperand(OffsetOpIdx))->getSExtValue();
+      MemOffs += Lo12;
+      if (!isInt<12>(MemOffs)) {
+        AllPointerUses = false;
+        break;
+      }
+    }
+
+    if (!AllPointerUses)
+      break;
+
+    Offset -= Lo12;
+    // Restore sign bits for RV32.
+    if (!Subtarget->is64Bit())
+      Offset = SignExtend64<32>(Offset);
+
+    // Emit (ADDI (ADD X, Hi), Lo)
+    SDNode *Imm = selectImm(CurDAG, DL, VT, Offset, *Subtarget);
+    SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT,
+                                         Node->getOperand(0), SDValue(Imm, 0));
+    SDNode *ADDI =
+        CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0),
+                               CurDAG->getTargetConstant(Lo12, DL, VT));
+    ReplaceNode(Node, ADDI);
+    return;
+  }
   case ISD::SRL: {
     // Optimize (srl (and X, C2), C) ->
     //          (srli (slli X, (XLen-C3)), (XLen-C3) + C)
@@ -2097,37 +2195,9 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width,
 //    -> (store val, (add base, src), off1+off2)
 // This is possible when off1+off2 fits a 12-bit immediate.
 bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
-  int OffsetOpIdx;
-  int BaseOpIdx;
-
-  // Only attempt this optimisation for I-type loads and S-type stores.
-  switch (N->getMachineOpcode()) {
-  default:
+  unsigned OffsetOpIdx, BaseOpIdx;
+  if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx))
     return false;
-  case RISCV::LB:
-  case RISCV::LH:
-  case RISCV::LW:
-  case RISCV::LBU:
-  case RISCV::LHU:
-  case RISCV::LWU:
-  case RISCV::LD:
-  case RISCV::FLH:
-  case RISCV::FLW:
-  case RISCV::FLD:
-    BaseOpIdx = 0;
-    OffsetOpIdx = 1;
-    break;
-  case RISCV::SB:
-  case RISCV::SH:
-  case RISCV::SW:
-  case RISCV::SD:
-  case RISCV::FSH:
-  case RISCV::FSW:
-  case RISCV::FSD:
-    BaseOpIdx = 1;
-    OffsetOpIdx = 2;
-    break;
-  }
 
   if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
     return false;
@@ -2137,54 +2207,8 @@ bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
   if (!Base.isMachineOpcode())
     return false;
 
-  // There is a ADD between ADDI and load/store. We can only fold ADDI that
-  // do not have a FrameIndex operand.
-  SDValue Add;
-  unsigned AddBaseIdx;
-  if (Base.getMachineOpcode() == RISCV::ADD && Base.hasOneUse()) {
-    Add = Base;
-    SDValue Op0 = Base.getOperand(0);
-    SDValue Op1 = Base.getOperand(1);
-    if (Op0.isMachineOpcode() && Op0.getMachineOpcode() == RISCV::ADDI &&
-        !isa<FrameIndexSDNode>(Op0.getOperand(0)) &&
-        isa<ConstantSDNode>(Op0.getOperand(1))) {
-      AddBaseIdx = 1;
-      Base = Op0;
-    } else if (Op1.isMachineOpcode() && Op1.getMachineOpcode() == RISCV::ADDI &&
-               !isa<FrameIndexSDNode>(Op1.getOperand(0)) &&
-               isa<ConstantSDNode>(Op1.getOperand(1))) {
-      AddBaseIdx = 0;
-      Base = Op1;
-    } else if (Op1.isMachineOpcode() &&
-               Op1.getMachineOpcode() == RISCV::ADDIW &&
-               isa<ConstantSDNode>(Op1.getOperand(1)) &&
-               Op1.getOperand(0).isMachineOpcode() &&
-               Op1.getOperand(0).getMachineOpcode() == RISCV::LUI) {
-      // We found an LUI+ADDIW constant materialization. We might be able to
-      // fold the ADDIW offset if it could be treated as ADDI.
-      // Emulate the constant materialization to see if the result would be
-      // a simm32 if ADDI was used instead of ADDIW.
-
-      // First the LUI.
-      uint64_t Imm = Op1.getOperand(0).getConstantOperandVal(0);
-      Imm <<= 12;
-      Imm = SignExtend64<32>(Imm);
-
-      // Then the ADDI.
-      uint64_t LoImm = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
-      Imm += LoImm;
-
-      // If the result isn't a simm32, we can't do the optimization.
-      if (!isInt<32>(Imm))
-        return false;
-
-      AddBaseIdx = 0;
-      Base = Op1;
-    } else
-      return false;
-  } else if (Base.getMachineOpcode() == RISCV::ADDI) {
-    // If the base is an ADDI, we can merge it in to the load/store.
-  } else
+  // If the base is an ADDI, we can merge it into the load/store.
+  if (Base.getMachineOpcode() != RISCV::ADDI)
     return false;
 
   SDValue ImmOperand = Base.getOperand(1);
@@ -2231,26 +2255,13 @@ bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
   LLVM_DEBUG(N->dump(CurDAG));
   LLVM_DEBUG(dbgs() << "\n");
 
-  if (Add)
-    Add = SDValue(CurDAG->UpdateNodeOperands(Add.getNode(),
-                                             Add.getOperand(AddBaseIdx),
-                                             Base.getOperand(0)),
-                  0);
-
   // Modify the offset operand of the load/store.
   if (BaseOpIdx == 0) { // Load
-    if (Add)
-      N = CurDAG->UpdateNodeOperands(N, Add, ImmOperand, N->getOperand(2));
-    else
-      N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
-                                     N->getOperand(2));
+    N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+                                   N->getOperand(2));
   } else { // Store
-    if (Add)
-      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Add, ImmOperand,
-                                     N->getOperand(3));
-    else
-      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
-                                     ImmOperand, N->getOperand(3));
+    N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+                                   ImmOperand, N->getOperand(3));
   }
 
   return true;

diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll
index 7af934fdacab..833496705a70 100644
--- a/llvm/test/CodeGen/RISCV/mem.ll
+++ b/llvm/test/CodeGen/RISCV/mem.ll
@@ -227,10 +227,9 @@ define i32 @lw_sw_far_local(i32* %a, i32 %b)  {
 ; RV32I-LABEL: lw_sw_far_local:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 4
-; RV32I-NEXT:    addi a2, a2, -4
 ; RV32I-NEXT:    add a2, a0, a2
-; RV32I-NEXT:    lw a0, 0(a2)
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    lw a0, -4(a2)
+; RV32I-NEXT:    sw a1, -4(a2)
 ; RV32I-NEXT:    ret
   %1 = getelementptr inbounds i32, i32* %a, i64 4095
   %2 = load volatile i32, i32* %1
@@ -266,10 +265,9 @@ define i32 @lw_sw_really_far_local(i32* %a, i32 %b)  {
 ; RV32I-LABEL: lw_sw_really_far_local:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    addi a2, a2, -2048
 ; RV32I-NEXT:    add a2, a0, a2
-; RV32I-NEXT:    lw a0, 0(a2)
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    lw a0, -2048(a2)
+; RV32I-NEXT:    sw a1, -2048(a2)
 ; RV32I-NEXT:    ret
   %1 = getelementptr inbounds i32, i32* %a, i32 536870400
   %2 = load volatile i32, i32* %1

diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll
index 47ab20e950c5..10255f4f7f0c 100644
--- a/llvm/test/CodeGen/RISCV/mem64.ll
+++ b/llvm/test/CodeGen/RISCV/mem64.ll
@@ -257,10 +257,9 @@ define i64 @lw_sw_far_local(i64* %a, i64 %b)  {
 ; RV64I-LABEL: lw_sw_far_local:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
-; RV64I-NEXT:    addiw a2, a2, -8
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    ld a0, 0(a2)
-; RV64I-NEXT:    sd a1, 0(a2)
+; RV64I-NEXT:    ld a0, -8(a2)
+; RV64I-NEXT:    sd a1, -8(a2)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 4095
   %2 = load volatile i64, i64* %1
@@ -273,10 +272,10 @@ define i64 @lw_sw_far_local(i64* %a, i64 %b)  {
 define i64 @lw_really_far_local(i64* %a)  {
 ; RV64I-LABEL: lw_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, 524288
-; RV64I-NEXT:    addiw a1, a1, -2048
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    slli a1, a1, 31
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ld a0, -2048(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   %2 = load volatile i64, i64* %1
@@ -288,10 +287,10 @@ define i64 @lw_really_far_local(i64* %a)  {
 define void @st_really_far_local(i64* %a, i64 %b)  {
 ; RV64I-LABEL: st_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    slli a2, a2, 31
 ; RV64I-NEXT:    add a0, a0, a2
-; RV64I-NEXT:    sd a1, 0(a0)
+; RV64I-NEXT:    sd a1, -2048(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   store i64 %b, i64* %1
@@ -303,11 +302,11 @@ define void @st_really_far_local(i64* %a, i64 %b)  {
 define i64 @lw_sw_really_far_local(i64* %a, i64 %b)  {
 ; RV64I-LABEL: lw_sw_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    slli a2, a2, 31
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    ld a0, 0(a2)
-; RV64I-NEXT:    sd a1, 0(a2)
+; RV64I-NEXT:    ld a0, -2048(a2)
+; RV64I-NEXT:    sd a1, -2048(a2)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   %2 = load volatile i64, i64* %1

diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
index 35671d8c9481..2510baf386e2 100644
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -13,30 +13,28 @@ define void @test1([65536 x i32]** %sp, [65536 x i32]* %t, i32 %n) {
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    lui a2, 20
-; RV32I-NEXT:    addi a2, a2, -1920
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    add a0, a0, a2
 ; RV32I-NEXT:    li a2, 2
-; RV32I-NEXT:    sw a2, 0(a0)
+; RV32I-NEXT:    sw a2, -1920(a0)
 ; RV32I-NEXT:    li a3, 1
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a3, 0(a1)
-; RV32I-NEXT:    sw a2, 4(a1)
+; RV32I-NEXT:    sw a3, -1916(a0)
+; RV32I-NEXT:    sw a3, -1920(a1)
+; RV32I-NEXT:    sw a2, -1916(a1)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test1:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    lui a2, 20
-; RV64I-NEXT:    addiw a2, a2, -1920
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    li a2, 2
-; RV64I-NEXT:    sw a2, 0(a0)
+; RV64I-NEXT:    sw a2, -1920(a0)
 ; RV64I-NEXT:    li a3, 1
-; RV64I-NEXT:    sw a3, 4(a0)
-; RV64I-NEXT:    sw a3, 0(a1)
-; RV64I-NEXT:    sw a2, 4(a1)
+; RV64I-NEXT:    sw a3, -1916(a0)
+; RV64I-NEXT:    sw a3, -1920(a1)
+; RV64I-NEXT:    sw a2, -1916(a1)
 ; RV64I-NEXT:    ret
 entry:
   %s = load [65536 x i32]*, [65536 x i32]** %sp
@@ -119,28 +117,27 @@ while_end:
 }
 
 ; GEPs have been manually split so the base GEP does not get used by any memory
-; instructions. Make sure we use a small offset in each of the stores.
+; instructions. Make sure we use an offset and common base for each of the
+; stores.
 define void @test3([65536 x i32]* %t) {
 ; RV32I-LABEL: test3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lui a1, 20
-; RV32I-NEXT:    addi a1, a1, -1920
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a1, -1916(a0)
 ; RV32I-NEXT:    li a1, 3
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a1, -1912(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test3:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lui a1, 20
-; RV64I-NEXT:    addiw a1, a1, -1920
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    sw a1, 4(a0)
+; RV64I-NEXT:    sw a1, -1916(a0)
 ; RV64I-NEXT:    li a1, 3
-; RV64I-NEXT:    sw a1, 8(a0)
+; RV64I-NEXT:    sw a1, -1912(a0)
 ; RV64I-NEXT:    ret
 entry:
   %0 = bitcast [65536 x i32]* %t to i8*


        

