[llvm] [AMDGPU][GFX12] Restrict scalar subword loads to PAL (PR #117576)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 26 08:27:19 PST 2024
https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/117576
>From 0b09ff508ec8a4bc865b88e3724bb50d41fc54f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 25 Nov 2024 16:48:44 +0100
Subject: [PATCH 1/4] [AMDGPU][AMDGPURegBankInfo] Map S_BUFFER_LOAD_XXX to its
corresponding BUFFER_LOAD_XXX
In some tests, code generation diverged between GlobalISel and SelectionDAG.
For example, this intrinsic call:
call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
would be lowered into either of these two instructions:
* buffer_load_u16 v2, v2, s[0:3], null offen
* buffer_load_b32 v2, v2, s[0:3], null offen
This patch fixes the divergence by mapping each G_AMDGPU_S_BUFFER_LOAD_XXX
to its corresponding G_AMDGPU_BUFFER_LOAD_XXX in the register bank mapping.
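For reference, a reproducer adapted from the divergent-offset tests updated
below (the function name and the trailing store are illustrative):
  define amdgpu_ps void @repro(<4 x i32> inreg %src, ptr addrspace(1) %out, i32 %offset) {
  main_body:
    ; %offset is divergent, so the load must become a MUBUF instruction
    %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
    %zext = zext i16 %ld to i32
    store i32 %zext, ptr addrspace(1) %out
    ret void
  }
Before this patch, SelectionDAG selected buffer_load_u16 here while GlobalISel
selected buffer_load_b32 followed by a v_and_b32 mask.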
---
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 41 +++++++---
.../AMDGPU/gfx12_scalar_subword_loads.ll | 78 ++++++-------------
2 files changed, 55 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b06bd4e334614f..6418402518262c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1406,16 +1406,37 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
if (i != 0)
BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
- B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
- .addDef(LoadParts[i]) // vdata
- .addUse(RSrc) // rsrc
- .addUse(VIndex) // vindex
- .addUse(VOffset) // voffset
- .addUse(SOffset) // soffset
- .addImm(ImmOffset + 16 * i) // offset(imm)
- .addImm(0) // cachepolicy, swizzled buffer(imm)
- .addImm(0) // idxen(imm)
- .addMemOperand(MMO);
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
+ break;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+ break;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
+ break;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+ break;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+
+ B.buildInstr(Opc)
+ .addDef(LoadParts[i]) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset + 16 * i) // offset(imm)
+ .addImm(0) // cachepolicy, swizzled buffer(imm)
+ .addImm(0) // idxen(imm)
+ .addMemOperand(MMO);
}
// TODO: If only the resource is a VGPR, it may be better to execute the
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 020c9dc130bb2a..61ae9639c52d00 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -465,19 +465,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i8 %ld to i32
@@ -538,20 +531,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i8 %ld to i32
@@ -606,19 +591,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i16 %ld to i32
@@ -679,20 +657,12 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; DAG: ; %bb.0: ; %main_body
-; DAG-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen
-; DAG-NEXT: s_wait_loadcnt 0x0
-; DAG-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
+; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i16 %ld to i32
>From 74f05bc6a5cb6e1dc2e69a27d5508953b5acd826 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 25 Nov 2024 11:32:39 +0100
Subject: [PATCH 2/4] [AMDGPU][GFX12] Pre-commit tests: Restrict scalar subword
loads for GFX12
---
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 5 +++++
llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll | 2 ++
2 files changed, 7 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 51361b75940560..ff2af95ad08b43 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -51,6 +51,11 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
cl::desc("Enable the use of AA during codegen."),
cl::init(true));
+static cl::opt<bool> UseGFX12SubwordSBufferLoad(
+ "amdgpu-use-gfx12-subword-sbuffer-load",
+ cl::desc("Enable the use of s_buffer_load_(i/u)(8/16) instructions."),
+ cl::init(false));
+
static cl::opt<unsigned>
NSAThreshold("amdgpu-nsa-threshold",
cl::desc("Number of addresses from which to enable MIMG NSA."),
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 61ae9639c52d00..921bc23d17deb1 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL %s
define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_s_load_i8:
>From 859e1798b1a25890e81d6e61d7648f433a5a7d85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 25 Nov 2024 14:47:23 +0100
Subject: [PATCH 3/4] [AMDGPU][DAG][GFX12] Restrict scalar subword loads on
GFX12
On gfx12, the s_buffer_load_(i/u)(8/16) instructions have a hardware bug
that is triggered when:
* the stride is not a multiple of 4, or
* the stride is 0 and the num-records is not a multiple of 4
For Vulkan and DX, it is guaranteed that the buffer's stride/num-records are
aligned to 4.
This patch prevents the emission of scalar subword loads unless an option
forcing their use is passed to the backend.
Solves SWDEV-498239
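A usage sketch, mirroring the RUN lines added to the test below: a frontend
that can guarantee the buffer's stride/num-records are aligned to 4 opts back
in with the new backend option:
  llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-use-gfx12-subword-sbuffer-load < in.ll
Without the option, a uniform-offset llvm.amdgcn.s.buffer.load.u16 is selected
as a plain buffer_load_u16 instead of s_buffer_load_u16.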
---
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 13 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 81 ++--
.../AMDGPU/gfx12_scalar_subword_loads.ll | 396 +++++++++++++-----
4 files changed, 356 insertions(+), 136 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index ff2af95ad08b43..19757cfdc66c3d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -353,6 +353,19 @@ void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
}
}
+bool GCNSubtarget::hasScalarSubwordBufferLoads() const {
+ Generation Gen = getGeneration();
+
+ // On gfx12, s_buffer_load_(i/u)(8/16) have a hw-bug that is triggered when:
+ // * the stride is not a multiple of 4, or
+ // * the stride is 0 and the num-records is not a multiple of 4
+ // Avoid these instructions unless the frontend explicitly specifies that the
+ // input buffers are known to not trigger the bug.
+ if (Gen == GFX12)
+ return UseGFX12SubwordSBufferLoad;
+ return hasScalarSubwordLoads();
+}
+
bool GCNSubtarget::hasMadF16() const {
return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ea5e159fdd8363..e0b0b26b5adea0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -466,6 +466,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
+ bool hasScalarSubwordBufferLoads() const;
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f326416a324178..8f4b1c35174e7f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6438,7 +6438,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
MachineMemOperand::MOInvariant,
VT.getStoreSize(), Alignment);
SDValue LoadVal;
- if (!Offset->isDivergent()) {
+ if (!Offset->isDivergent() && Subtarget->hasScalarSubwordBufferLoads()) {
SDValue Ops[] = {Rsrc, // source register
Offset, CachePolicy};
SDValue BufferLoad =
@@ -8367,52 +8367,57 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
MachineMemOperand::MOInvariant,
VT.getStoreSize(), Alignment);
- if (!Offset->isDivergent()) {
- SDValue Ops[] = {Rsrc, Offset, CachePolicy};
-
- // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
- // s_buffer_load_u16 instruction is emitted for both signed and unsigned
- // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
- // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
- if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
- SDValue BufferLoad =
- DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
- DAG.getVTList(MVT::i32), Ops, VT, MMO);
+ // Operands for a MUBUF buffer load, used when we cannot emit a scalar
+ // buffer load. We can assume that the buffer is unswizzled.
+ SDValue BufferLoadOps[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ CachePolicy, // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+
+ if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+ if (!Offset->isDivergent() && Subtarget->hasScalarSubwordBufferLoads()) {
+ // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
+ // s_buffer_load_u16 instruction is emitted for both signed and unsigned
+ // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
+ // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
+ SDValue SBufferLoadOps[] = {Rsrc, Offset, CachePolicy};
+ SDValue BufferLoad = DAG.getMemIntrinsicNode(
+ AMDGPUISD::SBUFFER_LOAD_USHORT, DL, DAG.getVTList(MVT::i32),
+ SBufferLoadOps, VT, MMO);
return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
}
+ // If s_buffer_load_u16/u8 is not supported by the platform (gfx12, when we
+ // cannot ensure the buffer's num-records/stride is properly aligned),
+ // lower to a buffer_load_u8/u16 instead.
+ setBufferOffsets(Offset, DAG, &BufferLoadOps[3], Align(4));
+ return handleByteShortBufferLoads(DAG, VT, DL, BufferLoadOps, MMO);
+ }
+
+ if (!Offset->isDivergent()) {
+ SDValue SBufferLoadOps[] = {Rsrc, Offset, CachePolicy};
+
// Widen vec3 load to vec4.
if (VT.isVector() && VT.getVectorNumElements() == 3 &&
!Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
- AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
- MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
+ AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), SBufferLoadOps,
+ WidenedVT, MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
DAG.getVectorIdxConstant(0, DL));
return Subvector;
}
return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
- DAG.getVTList(VT), Ops, VT, MMO);
- }
-
- // We have a divergent offset. Emit a MUBUF buffer load instead. We can
- // assume that the buffer is unswizzled.
- SDValue Ops[] = {
- DAG.getEntryNode(), // Chain
- Rsrc, // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- {}, // voffset
- {}, // soffset
- {}, // offset
- CachePolicy, // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
- };
- if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
- setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
- return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+ DAG.getVTList(VT), SBufferLoadOps, VT, MMO);
}
SmallVector<SDValue, 4> Loads;
@@ -8431,14 +8436,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
- setBufferOffsets(Offset, DAG, &Ops[3],
+ setBufferOffsets(Offset, DAG, &BufferLoadOps[3],
NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
- uint64_t InstOffset = Ops[5]->getAsZExtVal();
+ uint64_t InstOffset = BufferLoadOps[5]->getAsZExtVal();
for (unsigned i = 0; i < NumLoads; ++i) {
- Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
- Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
- LoadVT, MMO, DAG));
+ BufferLoadOps[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
+ Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+ BufferLoadOps, LoadVT, MMO, DAG));
}
if (NumElts == 8 || NumElts == 16)
@@ -12680,7 +12685,7 @@ SITargetLowering::performSignExtendInRegCombine(SDNode *N,
VTSign->getVT() == MVT::i8) ||
(Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
VTSign->getVT() == MVT::i16))) {
- assert(Subtarget->hasScalarSubwordLoads() &&
+ assert(Subtarget->hasScalarSubwordBufferLoads() &&
"s_buffer_load_{u8, i8} are supported "
"in GFX12 (or newer) architectures.");
EVT VT = Src.getValueType();
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 921bc23d17deb1..ae3eb6065363c2 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG,DAG-DEFAULT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG,DAG-SBUFFER %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL %s
define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
@@ -421,13 +421,28 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
}
define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_byte_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_byte_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: buffer_load_i8 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_byte_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_byte_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
%sext = sext i8 %ld to i32
@@ -436,13 +451,29 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_byte_sgpr:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_byte_sgpr:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_byte_sgpr:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i8 %ld to i32
@@ -451,13 +482,29 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0)
@@ -481,15 +528,32 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_ubyte_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_and_b32 s0, s0, 0xff
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ubyte_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: buffer_load_u8 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ubyte_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_and_b32 s0, s0, 0xff
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
+; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
%zext = zext i8 %ld to i32
@@ -498,15 +562,33 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_ubyte_sgpr:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_and_b32 s0, s0, 0xff
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ubyte_sgpr:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ubyte_sgpr:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_and_b32 s0, s0, 0xff
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
+; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i8 %ld to i32
@@ -515,15 +597,33 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_and_b32 s0, s0, 0xff
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_and_b32 s0, s0, 0xff
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
+; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
@@ -547,13 +647,28 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_short_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_short_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: buffer_load_i16 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_short_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_short_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
%sext = sext i16 %ld to i32
@@ -562,13 +677,29 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_short_sgpr:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_short_sgpr:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_short_sgpr:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i16 %ld to i32
@@ -577,13 +708,29 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %off, i32 0)
@@ -607,15 +754,32 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_ushort_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ushort_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: buffer_load_u16 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ushort_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ushort_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
+; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
%zext = zext i16 %ld to i32
@@ -624,15 +788,33 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_ushort_sgpr:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ushort_sgpr:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ushort_sgpr:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
+; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i16 %ld to i32
@@ -641,15 +823,33 @@ main_body:
}
define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_b32 v[0:1], v2, off
-; GCN-NEXT: s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; DAG-DEFAULT: ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT: v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT: s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; DAG-SBUFFER: ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
+; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %off, i32 0)
>From d875c5e7d223c3ca431dfa00c068ef3d9ea47b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 25 Nov 2024 14:54:40 +0100
Subject: [PATCH 4/4] [AMDGPU][GISEL][GFX12] Restrict scalar subword loads on
GFX12
On gfx12, the s_buffer_load_(i/u)(8/16) instructions have a hardware bug
that is triggered when:
* the stride is not a multiple of 4, or
* the stride is 0 and the num-records is not a multiple of 4
For Vulkan and DX, it is guaranteed that the buffer's stride/num-records are
aligned to 4.
This patch prevents the emission of scalar subword loads unless an option
forcing their use is passed to the backend.
Solves SWDEV-498239
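Conceptually, the legalizer change falls back by rewriting the intrinsic in
place (an illustrative, hypothetical IR-level equivalent; the actual rewrite
happens on machine IR):
  ; requested scalar subword load
  %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
  ; fallback when s_buffer_load_u8 is disallowed: a raw buffer load with zero soffset
  %ld.fb = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0, i32 0)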
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 36 ++-
.../AMDGPU/gfx12_scalar_subword_loads.ll | 304 ++++++------------
2 files changed, 124 insertions(+), 216 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9bf1f281c32a09..f6c308d6e523ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6803,13 +6803,38 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
unsigned Size = Ty.getSizeInBits();
MachineFunction &MF = B.getMF();
unsigned Opc = 0;
+
+ const unsigned MemSize = (Size + 7) / 8;
+ const Align MemAlign = B.getDataLayout().getABITypeAlign(
+ getTypeForLLT(Ty, MF.getFunction().getContext()));
+
+ // FIXME: When intrinsic definition is fixed, this should have an MMO already.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ MemSize, MemAlign);
+
if (Size < 32 && ST.hasScalarSubwordLoads()) {
assert(Size == 8 || Size == 16);
+ if (!ST.hasScalarSubwordBufferLoads()) {
+ // Fall back to a raw buffer load, which selects to buffer_load_u8/u16.
+ MI.getOperand(1).setIntrinsicID(Intrinsic::amdgcn_raw_buffer_load);
+
+ Register Zero = B.buildConstant(S32, 0).getReg(0);
+ MI.insert(MI.operands_begin() + 4,
+ MachineOperand::CreateReg(Zero, false));
+
+ MI.addMemOperand(MF, MMO);
+ Observer.changedInstr(MI);
+ return true;
+ }
+
Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
: AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
// The 8-bit and 16-bit scalar buffer load instructions have 32-bit
// destination register.
- Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
+ Dst = B.getMRI()->createGenericVirtualRegister(S32);
} else {
Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
Dst = OrigDst;
@@ -6834,15 +6859,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
MI.setDesc(B.getTII().get(Opc));
MI.removeOperand(1); // Remove intrinsic ID
- // FIXME: When intrinsic definition is fixed, this should have an MMO already.
- const unsigned MemSize = (Size + 7) / 8;
- const Align MemAlign = B.getDataLayout().getABITypeAlign(
- getTypeForLLT(Ty, MF.getFunction().getContext()));
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
if (Dst != OrigDst) {
MI.getOperand(0).setReg(Dst);
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index ae3eb6065363c2..5dc709d3be83dc 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG,DAG-DEFAULT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG,DAG-SBUFFER %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG,SBUFFER %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL,SBUFFER %s
define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_s_load_i8:
@@ -428,21 +428,13 @@ define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr a
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_byte_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_byte_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_byte_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
%sext = sext i8 %ld to i32
@@ -459,21 +451,13 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspa
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_byte_sgpr:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_byte_sgpr:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i8 %ld to i32
@@ -490,21 +474,13 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %sr
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0)
@@ -535,25 +511,15 @@ define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_ubyte_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_and_b32 s0, s0, 0xff
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
-; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ubyte_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
+; SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
%zext = zext i8 %ld to i32
@@ -570,25 +536,15 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrsp
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_and_b32 s0, s0, 0xff
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
-; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ubyte_sgpr:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
+; SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i8 %ld to i32
@@ -605,25 +561,15 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %s
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_and_b32 s0, s0, 0xff
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
-; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: s_and_b32 s0, s0, 0xff
+; SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
@@ -654,21 +600,13 @@ define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_short_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_short_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_short_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
%sext = sext i16 %ld to i32
@@ -685,21 +623,13 @@ define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrsp
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_short_sgpr:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_short_sgpr:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
%sext = sext i16 %ld to i32
@@ -716,21 +646,13 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %s
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %off, i32 0)
@@ -761,25 +683,15 @@ define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_ushort_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ushort_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
-; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ushort_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
+; SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
%zext = zext i16 %ld to i32
@@ -796,25 +708,15 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrs
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
-; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ushort_sgpr:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
+; SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
%zext = zext i16 %ld to i32
@@ -831,25 +733,15 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %
; DAG-DEFAULT-NEXT: global_store_b32 v[0:1], v2, off
; DAG-DEFAULT-NEXT: s_endpgm
;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
-; GISEL: ; %bb.0: ; %main_body
-; GISEL-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_mov_b32_e32 v2, s0
-; GISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GISEL-NEXT: s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
-; DAG-SBUFFER: ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT: s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
-; DAG-SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT: v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT: s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; SBUFFER: ; %bb.0: ; %main_body
+; SBUFFER-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT: s_wait_kmcnt 0x0
+; SBUFFER-NEXT: s_and_b32 s0, s0, 0xffff
+; SBUFFER-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT: v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT: global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT: s_endpgm
main_body:
%off = add nuw nsw i32 %in, 100
%ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %off, i32 0)