[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Include constrained buffer load variants (PR #101619)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Aug 1 23:35:14 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Christudasan Devadasan (cdevadas)
<details>
<summary>Changes</summary>
Use the constrained buffer load opcodes while combining under-aligned
loads for XNACK-enabled subtargets.
---
Patch is 47.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101619.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+63-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (+38-18)
- (modified) llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir (+512-52)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae537b194f50c..7553c370f694f 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
return New;
}
+static bool needsConstraintedOpcode(const GCNSubtarget &STM,
+ const MachineMemOperand *MMO,
+ unsigned Width) {
+ return STM.isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+}
+
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1734,51 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
- case S_BUFFER_LOAD_IMM:
+ case S_BUFFER_LOAD_IMM: {
+ const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
- case S_BUFFER_LOAD_SGPR_IMM:
+ }
+ case S_BUFFER_LOAD_SGPR_IMM: {
+ const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
+ }
case S_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc =
- STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 074489b9ff505..d085b3c768a86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -523,14 +523,23 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
; GFX67-NEXT: exp mrt0 v0, v1, v0, v0 done vm
; GFX67-NEXT: s_endpgm
;
-; GFX8910-LABEL: s_buffer_load_imm_mergex2:
-; GFX8910: ; %bb.0: ; %main_body
-; GFX8910-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
-; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT: v_mov_b32_e32 v0, s0
-; GFX8910-NEXT: v_mov_b32_e32 v1, s1
-; GFX8910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
-; GFX8910-NEXT: s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex2:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: exp mrt0 v0, v1, v0, v0 done vm
+; GFX8-NEXT: s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex2:
+; GFX910: ; %bb.0: ; %main_body
+; GFX910-NEXT: s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4
+; GFX910-NEXT: s_waitcnt lgkmcnt(0)
+; GFX910-NEXT: v_mov_b32_e32 v0, s4
+; GFX910-NEXT: v_mov_b32_e32 v1, s5
+; GFX910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
+; GFX910-NEXT: s_endpgm
;
; GFX11-LABEL: s_buffer_load_imm_mergex2:
; GFX11: ; %bb.0: ; %main_body
@@ -570,16 +579,27 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
; GFX67-NEXT: exp mrt0 v0, v1, v2, v3 done vm
; GFX67-NEXT: s_endpgm
;
-; GFX8910-LABEL: s_buffer_load_imm_mergex4:
-; GFX8910: ; %bb.0: ; %main_body
-; GFX8910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
-; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT: v_mov_b32_e32 v0, s0
-; GFX8910-NEXT: v_mov_b32_e32 v1, s1
-; GFX8910-NEXT: v_mov_b32_e32 v2, s2
-; GFX8910-NEXT: v_mov_b32_e32 v3, s3
-; GFX8910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
-; GFX8910-NEXT: s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex4:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX8-NEXT: s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex4:
+; GFX910: ; %bb.0: ; %main_body
+; GFX910-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x8
+; GFX910-NEXT: s_waitcnt lgkmcnt(0)
+; GFX910-NEXT: v_mov_b32_e32 v0, s4
+; GFX910-NEXT: v_mov_b32_e32 v1, s5
+; GFX910-NEXT: v_mov_b32_e32 v2, s6
+; GFX910-NEXT: v_mov_b32_e32 v3, s7
+; GFX910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX910-NEXT: s_endpgm
;
; GFX11-LABEL: s_buffer_load_imm_mergex4:
; GFX11: ; %bb.0: ; %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
index f8502091f8b78..02c1a328f4825 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
@@ -9,14 +9,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-LABEL: name: merge_s_buffer_load_x2
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_BUFFER_LOAD_DWORDX2_IMM]].sub0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX2_IMM]].sub1
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX10-LABEL: name: merge_s_buffer_load_x2
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %3:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x2
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_BUFFER_LOAD_DWORDX2_IMM]].sub0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX2_IMM]].sub1
+ ; GFX12-NEXT: S_ENDPGM 0
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
@@ -86,9 +95,9 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; GFX10-NEXT: early-clobber %7:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
@@ -170,9 +179,9 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
@@ -231,9 +240,9 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub0_sub1_sub2_sub3
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub1
@@ -288,18 +297,31 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX12-NEXT: S_ENDPGM 0
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
%2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
@@ -316,14 +338,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_B...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/101619
More information about the llvm-branch-commits
mailing list