[llvm] [AMDGPU][SILoadStoreOptimizer] Include constrained buffer load variants (PR #101619)
Christudasan Devadasan via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 5 22:50:35 PDT 2024
https://github.com/cdevadas updated https://github.com/llvm/llvm-project/pull/101619
>From f28301eb377d0556c3563100c5a4e5562b438e33 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 30 Jul 2024 14:46:36 +0530
Subject: [PATCH 1/4] [AMDGPU][SILoadStoreOptimizer] Include constrained buffer
load variants
Use the constrained buffer load opcodes when combining under-aligned
loads for XNACK-enabled subtargets.
---
.../Target/AMDGPU/SILoadStoreOptimizer.cpp | 75 ++-
.../AMDGPU/llvm.amdgcn.s.buffer.load.ll | 56 +-
.../CodeGen/AMDGPU/merge-sbuffer-load.mir | 564 ++++++++++++++++--
3 files changed, 613 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae537b194f50c..7553c370f694f 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
return New;
}
+static bool needsConstraintedOpcode(const GCNSubtarget &STM,
+ const MachineMemOperand *MMO,
+ unsigned Width) {
+ return STM.isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+}
+
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1734,51 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
- case S_BUFFER_LOAD_IMM:
+ case S_BUFFER_LOAD_IMM: {
+ const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
- case S_BUFFER_LOAD_SGPR_IMM:
+ }
+ case S_BUFFER_LOAD_SGPR_IMM: {
+ const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
+ }
case S_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc =
- STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 074489b9ff505..d085b3c768a86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -523,14 +523,23 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
; GFX67-NEXT: exp mrt0 v0, v1, v0, v0 done vm
; GFX67-NEXT: s_endpgm
;
-; GFX8910-LABEL: s_buffer_load_imm_mergex2:
-; GFX8910: ; %bb.0: ; %main_body
-; GFX8910-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
-; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT: v_mov_b32_e32 v0, s0
-; GFX8910-NEXT: v_mov_b32_e32 v1, s1
-; GFX8910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
-; GFX8910-NEXT: s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex2:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: exp mrt0 v0, v1, v0, v0 done vm
+; GFX8-NEXT: s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex2:
+; GFX910: ; %bb.0: ; %main_body
+; GFX910-NEXT: s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4
+; GFX910-NEXT: s_waitcnt lgkmcnt(0)
+; GFX910-NEXT: v_mov_b32_e32 v0, s4
+; GFX910-NEXT: v_mov_b32_e32 v1, s5
+; GFX910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
+; GFX910-NEXT: s_endpgm
;
; GFX11-LABEL: s_buffer_load_imm_mergex2:
; GFX11: ; %bb.0: ; %main_body
@@ -570,16 +579,27 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
; GFX67-NEXT: exp mrt0 v0, v1, v2, v3 done vm
; GFX67-NEXT: s_endpgm
;
-; GFX8910-LABEL: s_buffer_load_imm_mergex4:
-; GFX8910: ; %bb.0: ; %main_body
-; GFX8910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
-; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT: v_mov_b32_e32 v0, s0
-; GFX8910-NEXT: v_mov_b32_e32 v1, s1
-; GFX8910-NEXT: v_mov_b32_e32 v2, s2
-; GFX8910-NEXT: v_mov_b32_e32 v3, s3
-; GFX8910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
-; GFX8910-NEXT: s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex4:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX8-NEXT: s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex4:
+; GFX910: ; %bb.0: ; %main_body
+; GFX910-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x8
+; GFX910-NEXT: s_waitcnt lgkmcnt(0)
+; GFX910-NEXT: v_mov_b32_e32 v0, s4
+; GFX910-NEXT: v_mov_b32_e32 v1, s5
+; GFX910-NEXT: v_mov_b32_e32 v2, s6
+; GFX910-NEXT: v_mov_b32_e32 v3, s7
+; GFX910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX910-NEXT: s_endpgm
;
; GFX11-LABEL: s_buffer_load_imm_mergex4:
; GFX11: ; %bb.0: ; %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
index f8502091f8b78..02c1a328f4825 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
@@ -9,14 +9,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-LABEL: name: merge_s_buffer_load_x2
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_BUFFER_LOAD_DWORDX2_IMM]].sub0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX2_IMM]].sub1
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX10-LABEL: name: merge_s_buffer_load_x2
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %3:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x2
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_BUFFER_LOAD_DWORDX2_IMM]].sub0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX2_IMM]].sub1
+ ; GFX12-NEXT: S_ENDPGM 0
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
@@ -86,9 +95,9 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; GFX10-NEXT: early-clobber %7:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
@@ -170,9 +179,9 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
@@ -231,9 +240,9 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub0_sub1_sub2_sub3
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub1
@@ -288,18 +297,31 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX12-NEXT: S_ENDPGM 0
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
%2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
@@ -316,14 +338,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: S_ENDPGM 0
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
%2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
@@ -338,18 +369,31 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-LABEL: name: merge_s_buffer_load_x8_mixed
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_mixed
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_mixed
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
+ ; GFX12-NEXT: S_ENDPGM 0
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
@@ -371,9 +415,9 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub0_sub1
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub2_sub3
+ ; GFX10-NEXT: early-clobber %8:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY %8.sub0_sub1
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed %8.sub2_sub3
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
@@ -450,4 +494,420 @@ body: |
S_ENDPGM 0
...
+
+# The constrained multi-dword buffer load merge tests.
+
+---
+name: merge_s_buffer_load_x1_x2ec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; CHECK-LABEL: name: merge_s_buffer_load_x1_x2ec
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 4, 0 :: (dereferenceable invariant load (s64))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x2ec_x1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_x2ec_x1
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s64))
+ ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 8, 0 :: (dereferenceable invariant load (s32))
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x2ec_x1
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX3_IMM]].sub0_sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX3_IMM]].sub2
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x1_x3ec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; CHECK-LABEL: name: merge_s_buffer_load_x1_x3ec
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM_ec [[COPY]], 4, 0 :: (dereferenceable invariant load (s96), align 16)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ early-clobber %2:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM_ec %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s96))
+
+ S_ENDPGM 0
+...
---
+
+name: merge_s_buffer_load_x3ec_x1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; CHECK-LABEL: name: merge_s_buffer_load_x3ec_x1
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128))
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub3
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ early-clobber %1:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s96))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x8_out_of_x2ec_reordered
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2ec_reordered
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x2ec_reordered
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
+ early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
+ early-clobber %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
+ early-clobber %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x8_out_of_x2ec_x2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2ec_x2
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x2ec_x2
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
+ early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
+ %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
+ %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x8_out_of_x4ec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ early-clobber %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+ early-clobber %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x8_out_of_x4ec_x4
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec_x4
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec_x4
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ early-clobber %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+ %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x8_out_of_x4_x4ec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4_x4ec
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4_x4ec
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+ early-clobber %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x8_mixed_including_ec_opcodes
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_x8_mixed_including_ec_opcodes
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_x8_mixed_including_ec_opcodes
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ early-clobber %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ early-clobber %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+ %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+
+ ; GFX10-LABEL: name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec
+ ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX10-NEXT: early-clobber %4:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY %4.sub0_sub1
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed %4.sub2_sub3
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub0_sub1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub2_sub3
+ ; GFX12-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32 = COPY $sgpr4
+ early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec %0:sgpr_128, %1:sreg_32, 0, 0 :: (dereferenceable invariant load (s64))
+ early-clobber %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec %0:sgpr_128, %1:sreg_32, 8, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
+
+# No constrained opcode is required when the memory operand already meets the required alignment.
+
+---
+
+name: merge_s_buffer_load_x2_x2_no_constrained_opc_needed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; CHECK-LABEL: name: merge_s_buffer_load_x2_x2_no_constrained_opc_needed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128))
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64), align 16)
+ %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_x4_x4_no_constrained_opc_needed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ ; CHECK-LABEL: name: merge_s_buffer_load_x4_x4_no_constrained_opc_needed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256))
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128), align 32)
+ %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
+
+ S_ENDPGM 0
+...
+---
+
+name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec_no_constrained_opc_needed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+
+ ; CHECK-LABEL: name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec_no_constrained_opc_needed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128))
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub0_sub1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub2_sub3
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32 = COPY $sgpr4
+ %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM %0:sgpr_128, %1:sreg_32, 0, 0 :: (dereferenceable invariant load (s64), align 16)
+ %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM %0:sgpr_128, %1:sreg_32, 8, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
>From 07dfdda24278ecd4efb704bb9328a5c7037165b0 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Fri, 2 Aug 2024 16:50:34 +0530
Subject: [PATCH 2/4] Fix a typo in the function name needsConstrainedOpcode.
---
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 7553c370f694f..df2270a904ffb 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1711,9 +1711,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
return New;
}
-static bool needsConstraintedOpcode(const GCNSubtarget &STM,
- const MachineMemOperand *MMO,
- unsigned Width) {
+static bool needsConstrainedOpcode(const GCNSubtarget &STM,
+ const MachineMemOperand *MMO,
+ unsigned Width) {
return STM.isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
}
@@ -1736,7 +1736,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
llvm_unreachable("Unknown instruction class");
case S_BUFFER_LOAD_IMM: {
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc = needsConstrainedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
@@ -1756,7 +1756,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
}
case S_BUFFER_LOAD_SGPR_IMM: {
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc = needsConstrainedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
@@ -1778,7 +1778,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc = needsConstrainedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
>From f79e36490dfa598eafa1b8790bd61ada8753f22a Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Fri, 2 Aug 2024 17:04:34 +0530
Subject: [PATCH 3/4] validate MMO before using it.
---
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index df2270a904ffb..8c3d369756aee 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1735,8 +1735,10 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
case S_BUFFER_LOAD_IMM: {
+ // If XNACK is enabled, use the constrained opcodes when the first load is
+ // under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = needsConstrainedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc = MMO && needsConstrainedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
@@ -1755,8 +1757,10 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
}
}
case S_BUFFER_LOAD_SGPR_IMM: {
+ // If XNACK is enabled, use the constrained opcodes when the first load is
+ // under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = needsConstrainedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc = MMO && needsConstrainedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
@@ -1778,7 +1782,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = needsConstrainedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc = MMO && needsConstrainedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
>From 591ab6e4f7e90fb61ebf9b3d2aeadb85782da212 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Sat, 3 Aug 2024 00:53:17 +0530
Subject: [PATCH 4/4] improved needsConstrainedOpcode function to check for a
valid MMO.
---
.../lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 8c3d369756aee..b39fbdc26795c 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1712,9 +1712,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
}
static bool needsConstrainedOpcode(const GCNSubtarget &STM,
- const MachineMemOperand *MMO,
+ ArrayRef<MachineMemOperand *> MMOs,
unsigned Width) {
- return STM.isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+ // Conservatively return true unless there is exactly one MMO to check.
+ return STM.isXNACKEnabled() &&
+ (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
}
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
@@ -1737,8 +1739,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case S_BUFFER_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
- const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = MMO && needsConstrainedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc =
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
@@ -1759,8 +1761,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case S_BUFFER_LOAD_SGPR_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
- const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = MMO && needsConstrainedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc =
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
@@ -1781,8 +1783,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case S_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
- const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc = MMO && needsConstrainedOpcode(*STM, MMO, Width);
+ bool NeedsConstrainedOpc =
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
More information about the llvm-commits
mailing list