[llvm] [AMDGPU][AMDGPURegBankInfo] Map S_BUFFER_LOAD_XXX to its corresponding BUFFER_LOAD_XXX (PR #117574)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 28 05:32:12 PST 2024
https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/117574
>From 30f8af8d35f2db933bc47ecf4588b83afd33d000 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 25 Nov 2024 16:48:44 +0100
Subject: [PATCH] [AMDGPU][AMDGPURegBankInfo] Map S_BUFFER_LOAD_XXX to its
corresponding BUFFER_LOAD_XXX
In some tests, code generation diverged between GlobalISel and SelectionDAG.
For example, this intrinsic
call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32
0)
would be lowered into these two cases:
* buffer_load_u16 v2, v2, s[0:3], null offen
* buffer_load_b32 v2, v2, s[0:3], null offen
This patch fixes this issue.
---
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 38 ++++++---
.../AMDGPU/gfx12_scalar_subword_loads.ll | 78 ++++++-------------
2 files changed, 52 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8c050348f753bb..40443d0fd31696 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1330,6 +1330,24 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets(
return 0;
}
+static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ return AMDGPU::G_AMDGPU_BUFFER_LOAD;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
+ return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
+ default:
+ break;
+ }
+ llvm_unreachable("Unexpected s_buffer_load opcode");
+}
+
bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -1406,16 +1424,16 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
if (i != 0)
BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
- B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
- .addDef(LoadParts[i]) // vdata
- .addUse(RSrc) // rsrc
- .addUse(VIndex) // vindex
- .addUse(VOffset) // voffset
- .addUse(SOffset) // soffset
- .addImm(ImmOffset + 16 * i) // offset(imm)
- .addImm(0) // cachepolicy, swizzled buffer(imm)
- .addImm(0) // idxen(imm)
- .addMemOperand(MMO);
+ B.buildInstr(getSBufferLoadCorrespondingBufferLoadOpcode(MI.getOpcode()))
+ .addDef(LoadParts[i]) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset + 16 * i) // offset(imm)
+ .addImm(0) // cachepolicy, swizzled buffer(imm)
+ .addImm(0) // idxen(imm)
+ .addMemOperand(MMO);
}
// TODO: If only the resource is a VGPR, it may be better to execute the
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 020c9dc130bb2a..61ae9639c52d00 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -465,19 +465,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i8 %ld to i32
@@ -538,20 +531,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i8 %ld to i32
@@ -606,19 +591,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i16 %ld to i32
@@ -679,20 +657,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i16 %ld to i32
More information about the llvm-commits
mailing list