[llvm-branch-commits] [llvm] [WIP] AMDGPU: Handle v_add* in eliminateFrameIndex (PR #102346)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Aug 7 11:07:37 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
---
Patch is 297.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102346.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+218-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+34-42)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll (+6-1)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir (+333-584)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir (+171-221)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/frame-index.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll (+7-3)
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 76da1f0eb4f7d..81337c62ffe17 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2086,7 +2086,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
"unreserved scratch RSRC register");
- MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+ MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2268,6 +2268,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
+ case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_U32_e64:
+ case AMDGPU::V_ADD_CO_U32_e32:
+ case AMDGPU::V_ADD_CO_U32_e64: {
+ // TODO: Handle sub, and, or.
+ unsigned NumDefs = MI->getNumExplicitDefs();
+ unsigned Src0Idx = NumDefs;
+
+ bool HasClamp = false;
+ MachineOperand *VCCOp = nullptr;
+
+ switch (MI->getOpcode()) {
+ case AMDGPU::V_ADD_U32_e32:
+ break;
+ case AMDGPU::V_ADD_U32_e64:
+ HasClamp = MI->getOperand(3).getImm();
+ break;
+ case AMDGPU::V_ADD_CO_U32_e32:
+ VCCOp = &MI->getOperand(3);
+ break;
+ case AMDGPU::V_ADD_CO_U32_e64:
+ VCCOp = &MI->getOperand(1);
+ HasClamp = MI->getOperand(4).getImm();
+ break;
+ default:
+ break;
+ }
+ bool DeadVCC = !VCCOp || VCCOp->isDead();
+ MachineOperand &DstOp = MI->getOperand(0);
+ Register DstReg = DstOp.getReg();
+
+ unsigned OtherOpIdx =
+ FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+ MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+ unsigned Src1Idx = Src0Idx + 1;
+ Register MaterializedReg = FrameReg;
+ Register ScavengedVGPR;
+
+ if (FrameReg && !ST.enableFlatScratch()) {
+ // We should just do an in-place update of the result register. However,
+ // the value there may also be used by the add, in which case we need a
+ // temporary register.
+ //
+ // FIXME: The scavenger is not finding the result register in the
+ // common case where the add does not read the register.
+
+ ScavengedVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+ // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+ // shift.
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+ .addDef(ScavengedVGPR, RegState::Renamable)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ MaterializedReg = ScavengedVGPR;
+ }
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ // For the non-immediate case, we could fall through to the default
+ // handling, but we do an in-place update of the result register here to
+ // avoid scavenging another register.
+ if (OtherOp->isImm()) {
+ OtherOp->setImm(OtherOp->getImm() + Offset);
+ Offset = 0;
+ }
+
+ if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+ if (ST.enableFlatScratch() &&
+ !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+ // We didn't need the shift above, so we have an SGPR for the frame
+ // register, but may have a VGPR only operand.
+ //
+ // TODO: On gfx10+, we can easily change the opcode to the e64 version
+ // and use the higher constant bus restriction to avoid this copy.
+
+ if (!ScavengedVGPR) {
+ ScavengedVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+ /*SPAdj=*/0);
+ }
+
+ assert(ScavengedVGPR != DstReg);
+
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+ .addReg(MaterializedReg,
+ MaterializedReg != FrameReg ? RegState::Kill : 0);
+ MaterializedReg = ScavengedVGPR;
+ }
+
+ auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+ .addDef(DstReg, RegState::Renamable);
+ if (NumDefs == 2)
+ AddI32.add(MI->getOperand(1));
+
+ unsigned MaterializedRegFlags =
+ MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+ if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+ // If we know we have a VGPR already, it's more likely the other
+ // operand is a legal vsrc0.
+ AddI32
+ .add(*OtherOp)
+ .addReg(MaterializedReg, MaterializedRegFlags);
+ } else {
+ // Commute operands to avoid violating VOP2 restrictions. This will
+ // typically happen when using scratch.
+ AddI32
+ .addReg(MaterializedReg, MaterializedRegFlags)
+ .add(*OtherOp);
+ }
+
+ if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+ MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+ AddI32.addImm(0); // clamp
+
+ if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+ AddI32.setOperandDead(3); // Dead vcc
+
+ MaterializedReg = DstReg;
+
+ OtherOp->ChangeToRegister(MaterializedReg, false);
+ OtherOp->setIsKill(true);
+ FIOp->ChangeToImmediate(Offset);
+ Offset = 0;
+ } else if (Offset != 0) {
+ assert(!MaterializedReg);
+ FIOp->ChangeToImmediate(Offset);
+ Offset = 0;
+ } else {
+ if (DeadVCC && !HasClamp) {
+ assert(Offset == 0);
+
+ // TODO: Losing kills and implicit operands. Just mutate to copy and
+ // let lowerCopy deal with it?
+ if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+ // Folded to an identity copy.
+ MI->eraseFromParent();
+ return true;
+ }
+
+ // The immediate value should be in OtherOp
+ MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+ MI->removeOperand(FIOperandNum);
+
+ unsigned NumOps = MI->getNumOperands();
+ for (unsigned I = NumOps - 2; I >= 2; --I)
+ MI->removeOperand(I);
+
+ if (NumDefs == 2)
+ MI->removeOperand(1);
+
+ // The code below can't deal with a mov.
+ return true;
+ }
+
+ // This folded to a constant, but we have to keep the add around for
+ // pointless implicit defs or clamp modifier.
+ FIOp->ChangeToImmediate(0);
+ }
+
+ // Try to improve legality by commuting.
+ if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+ std::swap(FIOp, OtherOp);
+ std::swap(FIOperandNum, OtherOpIdx);
+ }
+
+ for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+ // Depending on operand constraints we may need to insert another copy.
+ if (!TII->isOperandLegal(*MI, SrcIdx)) {
+ // If commuting didn't make the operands legal, we need to materialize
+ // in a register.
+ // TODO: Can use SGPR on gfx10+ in some cases.
+ if (!ScavengedVGPR) {
+ ScavengedVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+ /*SPAdj=*/0);
+ }
+
+ assert(ScavengedVGPR != DstReg);
+
+ MachineOperand &Src = MI->getOperand(SrcIdx);
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+ .add(Src);
+
+ Src.ChangeToRegister(ScavengedVGPR, false);
+ Src.setIsKill(true);
+ }
+ }
+
+ // Fold out add of 0 case that can appear in kernels.
+ if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+ if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+ }
+
+ MI->eraseFromParent();
+ }
+
+ return true;
+ }
case AMDGPU::S_ADD_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_AND_B32: {
@@ -2336,7 +2538,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
} else {
if (MaterializedReg)
OtherOp.ChangeToRegister(MaterializedReg, false);
- FIOp.ChangeToImmediate(NewOffset);
+ FIOp->ChangeToImmediate(NewOffset);
}
return true;
@@ -2354,7 +2556,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// The offset is always swizzled, just replace it
if (FrameReg)
- FIOp.ChangeToRegister(FrameReg, false);
+ FIOp->ChangeToRegister(FrameReg, false);
MachineOperand *OffsetOp =
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2407,18 +2609,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
if (!FrameReg) {
- FIOp.ChangeToImmediate(Offset);
- if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+ FIOp->ChangeToImmediate(Offset);
+ if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
return false;
}
// We need to use register here. Check if we can use an SGPR or need
// a VGPR.
- FIOp.ChangeToRegister(AMDGPU::M0, false);
- bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+ FIOp->ChangeToRegister(AMDGPU::M0, false);
+ bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
if (!Offset && FrameReg && UseSGPR) {
- FIOp.setReg(FrameReg);
+ FIOp->setReg(FrameReg);
return false;
}
@@ -2427,8 +2629,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Register TmpReg =
RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
- FIOp.setReg(TmpReg);
- FIOp.setIsKill();
+ FIOp->setReg(TmpReg);
+ FIOp->setIsKill();
if ((!FrameReg || !Offset) && TmpReg) {
unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2457,8 +2659,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (!TmpSReg) {
// Use frame register and restore it after.
TmpSReg = FrameReg;
- FIOp.setReg(FrameReg);
- FIOp.setIsKill(false);
+ FIOp->setReg(FrameReg);
+ FIOp->setIsKill(false);
}
if (NeedSaveSCC) {
@@ -2706,7 +2908,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
- FIOp.ChangeToRegister(ResultReg, false, false, true);
+ FIOp->ChangeToRegister(ResultReg, false, false, true);
return false;
}
@@ -2737,13 +2939,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// If the offset is simply too big, don't convert to a scratch wave offset
// relative index.
- FIOp.ChangeToImmediate(Offset);
- if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+ FIOp->ChangeToImmediate(Offset);
+ if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
MI, false, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
- FIOp.ChangeToRegister(TmpReg, false, false, true);
+ FIOp->ChangeToRegister(TmpReg, false, false, true);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 83f2329feb8f2..e2eac156ea787 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -105,15 +105,13 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GFX9-NEXT: v_add_u32_e32 v1, 0, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -128,8 +126,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
@@ -140,12 +136,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT: v_mov_b32_e32 v2, 15
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -160,7 +155,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
@@ -539,15 +533,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x100
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v1, s32, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x100
+; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v0, s32, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0x100, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -557,14 +551,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x100
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x100
-; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
@@ -577,8 +571,8 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: s_add_i32 s0, s32, 0x100
-; GFX940-NEXT: v_add_u32_e32 v1, s0, v1
+; GFX940-NEXT: v_add_u32_e32 v1, s32, v1
+; GFX940-NEXT: v_add_u32_e32 v1, 0x100, v1
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -593,11 +587,12 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x100
; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v1, s32, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1
; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v0, s32 offset:256 glc dlc
@@ -855,15 +850,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v1, s32, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
+; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v0, s32, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0x4004, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -873,14 +868,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
@@ -893,8 +888,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX940-NEXT: v_add_u32_e32 v1, s0, v1
+; GFX940-NEXT: v_add_u32_e32 v1, s32, v1
+; GFX940-NEXT: v_add_u32_e32 v1, 0x4004, v1
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -913,9 +908,10 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-NEXT: ...
[truncated]
``````````
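A rough before/after sketch of the intended codegen effect, lifted from the GFX9 check-line changes above (illustrative only; exact instruction selection and scheduling differ per target):

```
; Before: the frame offset was first materialized into an SGPR,
; then added to the VGPR index.
s_add_i32 s0, s32, 0x100
v_add_u32_e32 v1, s0, v1

; After: eliminateFrameIndex folds the frame register and the object
; offset directly into the v_add, avoiding the scratch SGPR.
v_add_u32_e32 v1, s32, v1
v_add_u32_e32 v1, 0x100, v1
```

Adds of a zero frame offset (e.g. the `v_add_nc_u32_e32 v0, 0, v0` instructions removed from the kernel tests) are folded away entirely when vcc is dead and there is no clamp modifier.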
https://github.com/llvm/llvm-project/pull/102346
More information about the llvm-branch-commits mailing list