[llvm] [AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init (PR #79586)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 26 04:20:06 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (alex-t)
<details>
<summary>Changes</summary>
---
Patch is 100.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79586.diff
24 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+47-29)
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.h (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+140-140)
- (modified) llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/call-waitcnt.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/cc-update.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df14448667..4842cf83af0e18 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -378,7 +378,7 @@ class PrologEpilogSGPRSpillBuilder {
} // namespace llvm
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
-void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -398,6 +398,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
Register FlatScrInitLo;
Register FlatScrInitHi;
+ Register FlatScratchInitReg = AMDGPU::NoRegister;
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
@@ -407,7 +408,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
- Register FlatScrInit = AMDGPU::NoRegister;
+ Register FlatScratchInitReg = AMDGPU::NoRegister;
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
AllSGPR64s = AllSGPR64s.slice(
@@ -416,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
for (MCPhysReg Reg : AllSGPR64s) {
if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
- FlatScrInit = Reg;
+ FlatScratchInitReg = Reg;
break;
}
}
- assert(FlatScrInit && "Failed to find free register for scratch init");
+ assert(FlatScratchInitReg && "Failed to find free register for scratch init");
- FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
- FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
+ FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+ buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
@@ -440,8 +441,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
- BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
- .addReg(FlatScrInit)
+ BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
+ .addReg(FlatScratchInitReg)
.addImm(EncodedOffset) // offset
.addImm(0) // cpol
.addMemOperand(MMO);
@@ -453,7 +454,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0xffff);
And->getOperand(3).setIsDead(); // Mark SCC as dead.
} else {
- Register FlatScratchInitReg =
+ FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
assert(FlatScratchInitReg);
@@ -485,7 +486,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
addReg(FlatScrInitHi).
addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
(31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
- return;
+ return FlatScratchInitReg;
}
// For GFX9.
@@ -498,7 +499,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0);
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
- return;
+ return AMDGPU::FLAT_SCR;
}
assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
@@ -519,6 +520,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
+ return AMDGPU::FLAT_SCR;
}
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -610,11 +612,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = MF.getFunction();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
+ bool NeedsFlatScratchInit =
+ MFI->getUserSGPRInfo().hasFlatScratchInit() &&
+ (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+ (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
@@ -640,7 +646,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Now that we have fixed the reserved SRSRC we need to locate the
// (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg;
- if (ST.isAmdHsaOrMesa(F)) {
+ if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
PreloadedScratchRsrcReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
@@ -696,10 +702,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- bool NeedsFlatScratchInit =
- MFI->getUserSGPRInfo().hasFlatScratchInit() &&
- (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
- (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
@@ -707,22 +709,24 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
+ Register FlatScratchInit = AMDGPU::NoRegister;
if (NeedsFlatScratchInit) {
- emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+ FlatScratchInit =
+ emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
if (ScratchRsrcReg) {
- emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
- PreloadedScratchRsrcReg,
- ScratchRsrcReg, ScratchWaveOffsetReg);
+ emitEntryFunctionScratchRsrcRegSetup(
+ MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
+ PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
}
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, Register PreloadedScratchRsrcReg,
- Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+ const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -770,7 +774,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(21)
.addReg(Rsrc03);
}
- } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
+ } else if (ST.isMesaGfxShader(Fn) ||
+ (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -829,11 +834,24 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else if (ST.isAmdHsaOrMesa(Fn)) {
- assert(PreloadedScratchRsrcReg);
- if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ if (FlatScratchInit) {
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
+ TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+ .addReg(FlatScratchInit)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
+ TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
+ .addImm(0xf0000000)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ return;
+ } else {
+ assert(PreloadedScratchRsrcReg);
+
+ if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811f..f706d48b2dc101 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -67,19 +67,19 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
MachineBasicBlock::iterator MI) const override;
private:
- void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const DebugLoc &DL,
- Register ScratchWaveOffsetReg) const;
+ Register
+ emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register ScratchWaveOffsetReg) const;
Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
void emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
- Register ScratchWaveOffsetReg) const;
+ Register FlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
public:
bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6b..a7277414391cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,9 +14,9 @@ define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
; MUBUF-NEXT: s_mov_b32 s32, 0
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
@@ -62,8 +62,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index a439c0f51ffe9c..bda25cda4c5f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,19 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[2:3], 0xf0000000
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
@@ -70,19 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 7c8d40c49bb805..20f60d1db7fb5f 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,8 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5a128c7541d1ec..2baaefb76acb3b 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -5,13 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-LABEL: name: f1
; GFX90A: bb.0.bb:
; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
+ ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0
; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
- ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr2_sgpr3 = S_MOV_B64 4026531840, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a192a1b8dff935..0fe54349215ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,12 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i1_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
@@ -234,8 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
@@ -339,8 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
@@ -422,12 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i8_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/79586
More information about the llvm-commits
mailing list