[llvm] [AMDGPU] Do not allow M0 as v_readfirstlane_b32 dst (PR #128851)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 26 02:01:49 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
M0 can only be written to by the SALU, so `v_readfirstlane_b32 m0` is effectively useless. Represent this by restricting the destination register class (RC) of that instruction to `SReg_32_XM0`, which excludes M0.
There are a lot of test changes due to the register class changing, but most changes are trivial. In some cases, an extra register and an `s_mov_b32` are needed.
Fixes SWDEV-513269
---
Patch is 1.86 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128851.diff
149 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (+3-1)
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5-4)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+19-6)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+20-8)
- (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+1-1)
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir (+2-2)
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll (+52-52)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll (+44-44)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll (+44-44)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll (+392-392)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll (+52-52)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.append.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.consume.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.add.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.swap.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll (+48-48)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.mov.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.f16.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.f16.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll (+96-96)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mad_64_32.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fold-readlane.mir (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll (+100-100)
- (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll (+100-100)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll (+173-173)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll (+53-53)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll (+163-163)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll (+173-173)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll (+53-53)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll (+163-163)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll (+45-45)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll (+123-123)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll (+45-45)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll (+123-123)
- (modified) llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/licm-valu.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/licm-wwm.mir (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll (+6-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll (+6-7)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll (+22-40)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll (+34-62)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll (+15-10)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir (+4-5)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll (+33)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll (+10-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+59-57)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll (+56-52)
- (modified) llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+20-20)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2693ad3894cca..96c918a9a7f76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
for (unsigned i = 0; i < NumParts; ++i) {
Register SrcPart = SrcParts[i];
- Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
const TargetRegisterClass *Constrained =
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 308ab8e3b82c4..2ed313eac649e 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -271,6 +271,7 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)
DECODE_OPERAND_REG_7(SReg_32, OPW32)
+DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ac69bf6d038ec..a99fd25477553 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1069,6 +1069,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
<< " is being turned to v_readfirstlane_b32"
<< " Score: " << C.second.Score << "\n");
Register DstReg = MI->getOperand(0).getReg();
+ MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+
Register SrcReg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
const TargetRegisterClass *SrcRC =
@@ -1092,7 +1094,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
Result, *MRI, MI->getOperand(1), SrcRC,
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
Register PartialDst =
- MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, *Result, Result->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
.addReg(PartialSrc);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index ce21f8963fe88..97736e2410c18 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -305,6 +305,7 @@ class PrologEpilogSGPRSpillBuilder {
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
TmpVGPR, FI, FrameReg, DwordOff);
+ MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
.addReg(TmpVGPR, RegState::Kill);
DwordOff += 4;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index be7cdde802b51..9a0eee6b44891 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4569,7 +4569,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
Register PhiExec = MRI.createVirtualRegister(BoolRC);
Register NewExec = MRI.createVirtualRegister(BoolRC);
- Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurrentIdxReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register CondReg = MRI.createVirtualRegister(BoolRC);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
@@ -5255,18 +5256,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32;
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
- Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
.addReg(Src0.getReg());
Src0.setReg(RegOp0);
}
if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
- Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
.addReg(Src1.getReg());
Src1.setReg(RegOp1);
}
- Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
if (TRI->isVectorRegister(MRI, Src2.getReg())) {
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
.addReg(Src2.getReg());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d5d54337306c0..d1670040591d9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6515,7 +6515,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
Register VScalarOp = ScalarOp->getReg();
if (NumSubRegs == 1) {
- Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
.addReg(VScalarOp);
@@ -6547,8 +6547,10 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
"Unhandled register size");
for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
- Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurRegLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register CurRegHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// Read the next variant <- also loop target.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
@@ -7657,9 +7659,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
if (Inst.isCopy() && DstReg.isPhysical() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
// TODO: Only works for 32 bit registers.
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
- .add(Inst.getOperand(1));
+ if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .add(Inst.getOperand(1));
+ } else {
+ Register NewDst =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
+ .add(Inst.getOperand(1));
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
+ DstReg)
+ .addReg(NewDst);
+ }
Inst.eraseFromParent();
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5a078873679cb..aef25c73641e4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2994,10 +2994,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
- Register NewDest =
- IsCopy ? ResultReg
- : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
- Shift, false, 0);
+ Register NewDest;
+ if (IsCopy) {
+ MF->getRegInfo().constrainRegClass(ResultReg,
+ &AMDGPU::SReg_32_XM0RegClass);
+ NewDest = ResultReg;
+ } else {
+ NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+ Shift, false, 0);
+ }
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
.addReg(TmpResultReg);
ResultReg = NewDest;
@@ -3120,10 +3125,17 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(TmpResultReg);
}
- Register NewDest = IsCopy ? ResultReg
- : RS->scavengeRegisterBackwards(
- AMDGPU::SReg_32RegClass, *Add,
- false, 0, /*AllowSpill=*/true);
+ Register NewDest;
+ if (IsCopy) {
+ MF->getRegInfo().constrainRegClass(ResultReg,
+ &AMDGPU::SReg_32_XM0RegClass);
+ NewDest = ResultReg;
+ } else {
+ NewDest = RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
+ /*AllowSpill=*/true);
+ }
+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
.addReg(TmpResultReg);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index a407ae797a48b..def06c1e9a0d7 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -243,7 +243,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
} // End isMoveImm = 1
def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
- let DstRC = RegisterOperand<SReg_32>;
+ let DstRC = RegisterOperand<SReg_32_XM0>;
let Src0RC32 = VRegOrLdsSrc_32;
let Asm32 = " $vdst, $src0";
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir
index 9d15b8990bad3..6e1b5d641a8b7 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir
@@ -12,7 +12,7 @@ body: |
; CHECK-NEXT: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
- %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
+ %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
@@ -33,7 +33,7 @@ body: |
%4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
- %5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ %5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
S_ENDPGM 0
...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
index 9a7e755e5f5c8..f7c874be87d36 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
@@ -14,7 +14,7 @@ body: |
%0:vreg_64 = IMPLICIT_DEF
%1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
%2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
- %3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
+ %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %1(s32), implicit $exec
S_ENDPGM 0
...
@@ -50,7 +50,7 @@ body: |
%1:vreg_64 = IMPLICIT_DEF
%2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
%3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
+ %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2(s32), implicit $exec
S_ENDPGM 0
...
@@ -104,7 +104,7 @@ body: |
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
- %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
+ %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
%3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
index f71f573e5a799..23931ac358843 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
@@ -104,9 +104,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@@ -131,9 +131,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -158,9 +158,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
@@ -187,9 +187,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -298,9 +298,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/128851
More information about the llvm-commits
mailing list