[llvm] ce5c193 - [RISCV] Fold shladd into Xqcisls scaled load/store in RISCVMergeBaseOffset (#182221)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 21 02:06:41 PST 2026
Author: Sudharsan Veeravalli
Date: 2026-02-21T15:36:36+05:30
New Revision: ce5c1932e2b278de320d1e7ada3babb361c80643
URL: https://github.com/llvm/llvm-project/commit/ce5c1932e2b278de320d1e7ada3babb361c80643
DIFF: https://github.com/llvm/llvm-project/commit/ce5c1932e2b278de320d1e7ada3babb361c80643.diff
LOG: [RISCV] Fold shladd into Xqcisls scaled load/store in RISCVMergeBaseOffset (#182221)
We can fold `shxadd/qc.shladd` into base+offset load/store instructions
by transforming the load/store into `Xqcisls` scaled load/store
instructions.
For example:
```
qc.e.li vreg1, s
shxadd vreg2, vreg3, vreg1
lx vreg4, imm(vreg2)
can be transformed to
qc.e.li vreg1, s+imm
qc.lrx vreg4, vreg1, vreg3, (1-7)
```
Such patterns are not folded during ISEL because we prefer the
`ShlAdd` in `isWorthFoldingIntoRegRegScale` which gives us better code
in most cases.
Test cases were generated by an AI which I then hand-edited to add the
negative cases.
Added:
llvm/test/CodeGen/RISCV/xqcisls-merge-base-offset-shladd.ll
Modified:
llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index be15f17e84298..d7ad4c14468ee 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -43,6 +43,7 @@ class RISCVMergeBaseOffsetOpt : public MachineFunctionPass {
MachineInstr &TailShXAdd, Register GSReg);
bool foldIntoMemoryOps(MachineInstr &Hi, MachineInstr &Lo);
+ bool foldShxaddIntoScaledMemory(MachineInstr &Hi, MachineInstr &Lo);
RISCVMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}
@@ -575,6 +576,122 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
return true;
}
+// Try to fold sequences of the form:
+// Hi/lo: qc.e.li vreg1, s -> qc.e.li vreg1, s+imm
+// TailAdd: shxadd vreg2, vreg3, vreg1 -> deleted
+// Tail: lx vreg4, imm(vreg2) -> qc.lrx vreg4, vreg1, vreg3, (1-7)
+bool RISCVMergeBaseOffsetOpt::foldShxaddIntoScaledMemory(MachineInstr &Hi,
+ MachineInstr &Lo) {
+ if (!ST->hasVendorXqcisls() || ST->is64Bit())
+ return false;
+
+ if (Hi.getOpcode() != RISCV::QC_E_LI)
+ return false;
+
+ Register BaseReg = Hi.getOperand(0).getReg();
+ if (!BaseReg.isVirtual() || !MRI->hasOneUse(BaseReg))
+ return false;
+
+ MachineInstr &ShxAdd = *MRI->use_instr_begin(BaseReg);
+ unsigned ShxOpc = ShxAdd.getOpcode();
+ unsigned ShAmt = 0;
+ switch (ShxOpc) {
+ default:
+ return false;
+ case RISCV::SH1ADD:
+ ShAmt = 1;
+ break;
+ case RISCV::SH2ADD:
+ ShAmt = 2;
+ break;
+ case RISCV::SH3ADD:
+ ShAmt = 3;
+ break;
+ case RISCV::QC_SHLADD:
+ uint8_t ShlImm = ShxAdd.getOperand(3).getImm();
+ if (ShlImm > 7)
+ return false;
+ ShAmt = ShlImm;
+ break;
+ }
+
+ // shxadd Rd, Rs1, Rs2
+ Register ScaledReg = ShxAdd.getOperand(0).getReg();
+ Register IndexReg = ShxAdd.getOperand(1).getReg();
+
+ if (!IndexReg.isVirtual())
+ return false;
+
+ if (ShxAdd.getOperand(2).getReg() != BaseReg)
+ return false;
+
+ if (!ScaledReg.isVirtual() || !MRI->hasOneUse(ScaledReg))
+ return false;
+
+ MachineInstr &TailMem = *MRI->use_instr_begin(ScaledReg);
+ unsigned Opc = TailMem.getOpcode();
+ unsigned NewOpc = 0;
+
+ switch (Opc) {
+ case RISCV::LB:
+ NewOpc = RISCV::QC_LRB;
+ break;
+ case RISCV::LBU:
+ NewOpc = RISCV::QC_LRBU;
+ break;
+ case RISCV::LH:
+ NewOpc = RISCV::QC_LRH;
+ break;
+ case RISCV::LHU:
+ NewOpc = RISCV::QC_LRHU;
+ break;
+ case RISCV::LW:
+ NewOpc = RISCV::QC_LRW;
+ break;
+ case RISCV::SB:
+ NewOpc = RISCV::QC_SRB;
+ break;
+ case RISCV::SH:
+ NewOpc = RISCV::QC_SRH;
+ break;
+ case RISCV::SW:
+ NewOpc = RISCV::QC_SRW;
+ break;
+ default:
+ return false;
+ }
+
+ if (!TailMem.getOperand(1).isReg() ||
+ TailMem.getOperand(1).getReg() != ScaledReg)
+ return false;
+ if (!TailMem.getOperand(2).isImm())
+ return false;
+ int64_t Imm = TailMem.getOperand(2).getImm();
+
+ // Update QC_E_LI offset.
+ int64_t NewOffset = SignExtend64<32>(Hi.getOperand(1).getOffset() + Imm);
+
+ Hi.getOperand(1).setOffset(NewOffset);
+
+ // Build scaled load/store.
+ auto *TII = ST->getInstrInfo();
+ auto *MBB = TailMem.getParent();
+
+ // Ensure index register satisfies GPRNoX0 class required by QC_LR*/QC_SR*.
+ MRI->constrainRegClass(IndexReg, &RISCV::GPRNoX0RegClass);
+
+ BuildMI(*MBB, TailMem, TailMem.getDebugLoc(), TII->get(NewOpc))
+ .add(TailMem.getOperand(0))
+ .addReg(BaseReg, getKillRegState(ShxAdd.getOperand(2).isKill()))
+ .addReg(IndexReg, getKillRegState(ShxAdd.getOperand(1).isKill()))
+ .addImm(ShAmt)
+ .cloneMemRefs(TailMem);
+
+ TailMem.eraseFromParent();
+ ShxAdd.eraseFromParent();
+ return true;
+}
+
bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;
@@ -591,6 +708,7 @@ bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
continue;
MadeChange |= detectAndFoldOffset(Hi, *Lo);
MadeChange |= foldIntoMemoryOps(Hi, *Lo);
+ MadeChange |= foldShxaddIntoScaledMemory(Hi, *Lo);
}
}
diff --git a/llvm/test/CodeGen/RISCV/xqcisls-merge-base-offset-shladd.ll b/llvm/test/CodeGen/RISCV/xqcisls-merge-base-offset-shladd.ll
new file mode 100644
index 0000000000000..fae3bfb35213f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xqcisls-merge-base-offset-shladd.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -mattr=+xqcili,+xqcisls,+zba,+xqciac -verify-machineinstrs %s -o - | FileCheck %s
+
+ at sym = external global i8, align 1
+
+define i32 @load_scaled(i32 %idx) nounwind {
+; CHECK-LABEL: load_scaled:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a1, sym+16
+; CHECK-NEXT: qc.lrw a0, a1, a0, 2
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 2
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i8*
+ %addr_plus_imm = getelementptr i8, i8* %sum_ptr, i32 16
+ %addr_i32p = bitcast i8* %addr_plus_imm to i32*
+ %val = load i32, i32* %addr_i32p, align 4
+ ret i32 %val
+}
+
+define void @store_scaled(i32 %idx, i32 %val) nounwind {
+; CHECK-LABEL: store_scaled:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a2, sym+12
+; CHECK-NEXT: qc.srw a1, a2, a0, 3
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 3
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i8*
+ %addr_plus_imm = getelementptr i8, i8* %sum_ptr, i32 12
+ %addr_i32p = bitcast i8* %addr_plus_imm to i32*
+ store i32 %val, i32* %addr_i32p, align 4
+ ret void
+}
+
+define i32 @load_scaled_i8(i32 %idx) nounwind {
+; CHECK-LABEL: load_scaled_i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a1, sym+5
+; CHECK-NEXT: qc.lrb a0, a1, a0, 2
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 2
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i8*
+ %addr_plus_imm = getelementptr i8, i8* %sum_ptr, i32 5
+ %val8 = load i8, i8* %addr_plus_imm, align 1
+ %val = sext i8 %val8 to i32
+ ret i32 %val
+}
+
+define void @store_scaled_i8(i32 %idx, i32 %val) nounwind {
+; CHECK-LABEL: store_scaled_i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a2, sym+7
+; CHECK-NEXT: qc.srb a1, a2, a0, 3
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 3
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i8*
+ %addr_plus_imm = getelementptr i8, i8* %sum_ptr, i32 7
+ %val8 = trunc i32 %val to i8
+ store i8 %val8, i8* %addr_plus_imm, align 1
+ ret void
+}
+
+define i32 @load_scaled_i16(i32 %idx) nounwind {
+; CHECK-LABEL: load_scaled_i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a1, sym+6
+; CHECK-NEXT: qc.lrh a0, a1, a0, 1
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 1
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i16*
+ %addr_plus_imm = getelementptr i16, i16* %sum_ptr, i32 3
+ %val16 = load i16, i16* %addr_plus_imm, align 2
+ %val = sext i16 %val16 to i32
+ ret i32 %val
+}
+
+define void @store_scaled_i16(i32 %idx, i32 %val) nounwind {
+; CHECK-LABEL: store_scaled_i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a2, sym+10
+; CHECK-NEXT: qc.srh a1, a2, a0, 2
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 2
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i16*
+ %addr_plus_imm = getelementptr i16, i16* %sum_ptr, i32 5
+ %val16 = trunc i32 %val to i16
+ store i16 %val16, i16* %addr_plus_imm, align 2
+ ret void
+}
+
+define void @no_store_scaled_i16_multiuse_base_reg(i32 %idx, i32 %val) nounwind {
+; CHECK-LABEL: no_store_scaled_i16_multiuse_base_reg:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a2, sym
+; CHECK-NEXT: sh2add a0, a0, a2
+; CHECK-NEXT: sh a1, 10(a0)
+; CHECK-NEXT: sh a1, 14(a0)
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 2
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i16*
+ %addr_plus_imm = getelementptr i16, i16* %sum_ptr, i32 5
+ %addr_plus_imm2 = getelementptr i16, i16* %sum_ptr, i32 7
+ %val16 = trunc i32 %val to i16
+ store i16 %val16, i16* %addr_plus_imm, align 2
+ store i16 %val16, i16* %addr_plus_imm2, align 2
+ ret void
+}
+
+define void @no_store_scaled_i8_multiuse_scaled_reg(i32 %idx, i32 %val) nounwind {
+; CHECK-LABEL: no_store_scaled_i8_multiuse_scaled_reg:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a2, sym
+; CHECK-NEXT: sh3add a0, a0, a2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: sb a1, 7(a0)
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 3
+ %sum = add i32 %baseint, %idxsh
+ %sum2 = add i32 %sum, %val
+ %sum_ptr = inttoptr i32 %sum2 to i8*
+ %addr_plus_imm = getelementptr i8, i8* %sum_ptr, i32 7
+ %val8 = trunc i32 %val to i8
+ store i8 %val8, i8* %addr_plus_imm, align 1
+ ret void
+}
+
+define i32 @load_scaled_i16_shift_gt_3(i32 %idx) nounwind {
+; CHECK-LABEL: load_scaled_i16_shift_gt_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a1, sym+6
+; CHECK-NEXT: qc.lrh a0, a1, a0, 5
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 5
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i16*
+ %addr_plus_imm = getelementptr i16, i16* %sum_ptr, i32 3
+ %val16 = load i16, i16* %addr_plus_imm, align 2
+ %val = sext i16 %val16 to i32
+ ret i32 %val
+}
+
+define i32 @no_load_scaled_i16_shift_gt_7(i32 %idx) nounwind {
+; CHECK-LABEL: no_load_scaled_i16_shift_gt_7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: qc.e.li a1, sym
+; CHECK-NEXT: qc.shladd a0, a0, a1, 8
+; CHECK-NEXT: lh a0, 6(a0)
+; CHECK-NEXT: ret
+entry:
+ %baseptr = getelementptr i8, i8* @sym, i32 0
+ %baseint = ptrtoint i8* %baseptr to i32
+ %idxsh = shl i32 %idx, 8
+ %sum = add i32 %baseint, %idxsh
+ %sum_ptr = inttoptr i32 %sum to i16*
+ %addr_plus_imm = getelementptr i16, i16* %sum_ptr, i32 3
+ %val16 = load i16, i16* %addr_plus_imm, align 2
+ %val = sext i16 %val16 to i32
+ ret i32 %val
+}
More information about the llvm-commits
mailing list