[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jun 20 03:37:50 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel
Author: Christudasan Devadasan (cdevadas)
Changes:
Consider the constrained multi-dword loads when merging
individual loads into a single multi-dword load.
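
For context, here is a rough MIR-level sketch of the kind of rewrite this enables on a subtarget with XNACK replay; the register classes, offsets, and operand details below are hypothetical and only illustrate the shape of the transformation, not literal output of the pass:

```
# Two adjacent constrained scalar loads (hypothetical operands):
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM_ec %0:sgpr_64, 0, 0
%2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM_ec %0:sgpr_64, 4, 0

# After merging: a single constrained multi-dword load with an
# early-clobber def, followed by copies back to the original
# destinations (the early-clobber flag is dropped on the copies).
early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0
%1:sreg_32_xm0_xexec = COPY %3.sub0
%2:sreg_32_xm0_xexec = COPY %3.sub1
```

On subtargets without XNACK replay, the unconstrained S_LOAD_DWORDX2_IMM would be chosen instead, matching the getNewOpcode change in the diff below.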
---
Patch is 1023.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96162.diff
116 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+1)
- (modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+63-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (+84-85)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+51-51)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+102-102)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+65-65)
- (modified) llvm/test/CodeGen/AMDGPU/add.v2i16.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+204-204)
- (modified) llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/bfe-patterns.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/bfm.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/bitreverse.ll (+157-157)
- (modified) llvm/test/CodeGen/AMDGPU/build_vector.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/cluster_stores.ll (+60-60)
- (modified) llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+227-227)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+223-223)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop16.ll (+52-48)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop64.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/cttz.ll (+185-185)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+212-212)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+331-334)
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+31-32)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+33-33)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+143-143)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+56-56)
- (modified) llvm/test/CodeGen/AMDGPU/fma-combine.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll (+240-240)
- (modified) llvm/test/CodeGen/AMDGPU/fnearbyint.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+23-23)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll (+33-33)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll (+46-46)
- (modified) llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_sint.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_uint.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/fshl.ll (+44-44)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+98-98)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+41-41)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+124-124)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+53-53)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+184-208)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll (+84-84)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+66-66)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll (+122-122)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll (+41-41)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/madak.ll (+92-92)
- (modified) llvm/test/CodeGen/AMDGPU/memory_clause.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/merge-s-load.mir (+144-36)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll (+80-80)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+18-11)
- (modified) llvm/test/CodeGen/AMDGPU/mul_int24.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll (+55-55)
- (modified) llvm/test/CodeGen/AMDGPU/or.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/packed-op-sel.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/shl.v2i16.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/sign_extend.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+56-56)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/xor.ll (+5-5)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467f22389..19d5b950d7142 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -967,6 +967,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
+ bool hasXnackReplay() const { return GFX8Insts; }
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 8b42d4a1dee7a..0b285d52b539e 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
CombineInfo &Paired, bool Modify = false);
static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
const CombineInfo &Paired);
- static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+ static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
+ const GCNSubtarget *STI = nullptr);
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
const CombineInfo &Paired);
const TargetRegisterClass *
@@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
case AMDGPU::GLOBAL_STORE_DWORD:
@@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return S_LOAD_IMM;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_LOAD_DWORD_IMM;
case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
Result.SBase = true;
return Result;
case AMDGPU::DS_READ_B32:
@@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs(
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
- const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
- const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+ auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+ auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+ // The constrained sload instructions in S_LOAD_IMM class will have
+ // `early-clobber` flag in the dst operand. Remove the flag before using the
+ // MOs in copies.
+ if (Dest0->isEarlyClobber())
+ Dest0->setIsEarlyClobber(false);
+
+ if (Dest1->isEarlyClobber())
+ Dest1->setIsEarlyClobber(false);
BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI, Paired);
+ const unsigned Opcode = getNewOpcode(CI, Paired, STM);
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
}
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
- const CombineInfo &Paired) {
+ const CombineInfo &Paired,
+ const GCNSubtarget *STI) {
const unsigned Width = CI.Width + Paired.Width;
switch (getCommonInstClass(CI, Paired)) {
@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
case S_LOAD_IMM:
- switch (Width) {
- default:
- return 0;
- case 2:
- return AMDGPU::S_LOAD_DWORDX2_IMM;
- case 3:
- return AMDGPU::S_LOAD_DWORDX3_IMM;
- case 4:
- return AMDGPU::S_LOAD_DWORDX4_IMM;
- case 8:
- return AMDGPU::S_LOAD_DWORDX8_IMM;
+ // For targets that support XNACK replay, use the constrained load opcode.
+ if (STI && STI->hasXnackReplay()) {
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
+ case 3:
+ return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
+ case 4:
+ return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
+ case 8:
+ return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
+ }
+ } else {
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::S_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_LOAD_DWORDX3_IMM;
+ case 4:
+ return AMDGPU::S_LOAD_DWORDX4_IMM;
+ case 8:
+ return AMDGPU::S_LOAD_DWORDX8_IMM;
+ }
}
case GLOBAL_LOAD:
switch (Width) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index eb20178f9f4d8..3f034eaca4997 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -468,18 +468,18 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
;
; VI-LABEL: load_i8_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -513,16 +513,16 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v2i8_to_v2f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v1, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -562,16 +562,16 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v3i8_to_v3f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -612,16 +612,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v4i8_to_v4f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -679,11 +679,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
@@ -706,12 +706,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v3, v1, v0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -770,6 +770,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v6, 9
; VI-NEXT: v_mov_b32_e32 v7, 8
@@ -779,11 +780,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xff
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -804,8 +803,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v2
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -858,11 +857,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v7i8_to_v7f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
@@ -884,10 +883,10 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; VI-NEXT: flat_load_ubyte v4, v[8:9]
; VI-NEXT: flat_load_ubyte v5, v[10:11]
; VI-NEXT: flat_load_ubyte v6, v[12:13]
-; VI-NEXT: v_mov_b32_e32 v8, s1
-; VI-NEXT: v_mov_b32_e32 v7, s0
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v8, s5
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v7, s4
; VI-NEXT: v_mov_b32_e32 v10, s1
; VI-NEXT: v_mov_b32_e32 v9, s0
; VI-NEXT: s_waitcnt vmcnt(6)
@@ -949,18 +948,18 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v8i8_to_v8f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v9, s1
-; VI-NEXT: v_mov_b32_e32 v8, s0
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v9, s5
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v11, s1
; VI-NEXT: v_mov_b32_e32 v10, s0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1005,19 +1004,19 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1051,18 +1050,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1096,18 +1095,18 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1157,11 +1156,1...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/96162
More information about the llvm-branch-commits mailing list