[llvm] [AMDGPU][GFX12] Restrict scalar subword loads to PAL (PR #117576)

Juan Manuel Martinez Caamaño via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 26 08:27:19 PST 2024


https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/117576

>From 0b09ff508ec8a4bc865b88e3724bb50d41fc54f7 Mon Sep 17 00:00:00 2001
From: Juan Manuel Martinez Caamaño <juamarti at amd.com>
Date: Mon, 25 Nov 2024 16:48:44 +0100
Subject: [PATCH 1/4] [AMDGPU][AMDGPURegBankInfo] Map S_BUFFER_LOAD_XXX to its
 corresponding BUFFER_LOAD_XXX

In some tests, code generation diverged between GlobalISel and SelectionDAG.

For example, this intrinsic call

    call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)

would be lowered in two different ways:
* buffer_load_u16 v2, v2, s[0:3], null offen (SelectionDAG)
* buffer_load_b32 v2, v2, s[0:3], null offen (GlobalISel)

This patch fixes this issue.
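
As a minimal reproducer, distilled from the divergent-offset tests in
gfx12_scalar_subword_loads.ll (a sketch: the file name repro.ll and the
function name @repro are illustrative), both of the following commands
should now emit the same buffer_load_u16:

    ; llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < repro.ll
    ; llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < repro.ll
    define amdgpu_ps void @repro(<4 x i32> inreg %src, ptr addrspace(1) %out, i32 %offset) {
      %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
      %zext = zext i16 %ld to i32
      store i32 %zext, ptr addrspace(1) %out
      ret void
    }
    declare i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32>, i32, i32)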
---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  | 41 +++++++---
 .../AMDGPU/gfx12_scalar_subword_loads.ll      | 78 ++++++-------------
 2 files changed, 55 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b06bd4e334614f..6418402518262c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1406,16 +1406,37 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
     if (i != 0)
       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
 
-    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
-      .addDef(LoadParts[i])       // vdata
-      .addUse(RSrc)               // rsrc
-      .addUse(VIndex)             // vindex
-      .addUse(VOffset)            // voffset
-      .addUse(SOffset)            // soffset
-      .addImm(ImmOffset + 16 * i) // offset(imm)
-      .addImm(0)                  // cachepolicy, swizzled buffer(imm)
-      .addImm(0)                  // idxen(imm)
-      .addMemOperand(MMO);
+    unsigned Opc;
+    switch (MI.getOpcode()) {
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
+      break;
+    default:
+      llvm_unreachable("Unexpected opcode");
+    }
+
+    B.buildInstr(Opc)
+        .addDef(LoadParts[i])       // vdata
+        .addUse(RSrc)               // rsrc
+        .addUse(VIndex)             // vindex
+        .addUse(VOffset)            // voffset
+        .addUse(SOffset)            // soffset
+        .addImm(ImmOffset + 16 * i) // offset(imm)
+        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
+        .addImm(0)                  // idxen(imm)
+        .addMemOperand(MMO);
   }
 
   // TODO: If only the resource is a VGPR, it may be better to execute the
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 020c9dc130bb2a..61ae9639c52d00 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -465,19 +465,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i8 %ld to i32
@@ -538,20 +531,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i8 %ld to i32
@@ -606,19 +591,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i16 %ld to i32
@@ -679,20 +657,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i16 %ld to i32

>From 74f05bc6a5cb6e1dc2e69a27d5508953b5acd826 Mon Sep 17 00:00:00 2001
From: Juan Manuel Martinez Caamaño <juamarti at amd.com>
Date: Mon, 25 Nov 2024 11:32:39 +0100
Subject: [PATCH 2/4] [AMDGPU][GFX12] Pre-commit tests: Restrict scalar subword
 loads for GFX12

---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp                | 5 +++++
 llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 51361b75940560..ff2af95ad08b43 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -51,6 +51,11 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                            cl::desc("Enable the use of AA during codegen."),
                            cl::init(true));
 
+static cl::opt<bool> UseGFX12SubwordSBufferLoad(
+    "amdgpu-use-gfx12-subword-sbuffer-load",
+    cl::desc("Enable the use of s_buffer_load_(i/u)(8/16) instructions."),
+    cl::init(false));
+
 static cl::opt<unsigned>
     NSAThreshold("amdgpu-nsa-threshold",
                  cl::desc("Number of addresses from which to enable MIMG NSA."),
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 61ae9639c52d00..921bc23d17deb1 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL %s
 
 define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_s_load_i8:

>From 859e1798b1a25890e81d6e61d7648f433a5a7d85 Mon Sep 17 00:00:00 2001
From: Juan Manuel Martinez Caamaño <juamarti at amd.com>
Date: Mon, 25 Nov 2024 14:47:23 +0100
Subject: [PATCH 3/4] [AMDGPU][DAG][GFX12] Restrict scalar subword loads on
 GFX12

On gfx12, the s_buffer_load_(i/u)(8/16) instructions have a hardware bug
that is triggered when:
* the stride is not a multiple of 4, or
* the stride is 0 and the num-records is not a multiple of 4

For Vulkan and DX, it is guaranteed that the buffer's stride/num-records
are aligned to 4.

This patch prevents the emission of scalar subword loads unless an option
forcing them is passed to the backend.
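
For illustration, the trigger condition can be read as the following
predicate (a sketch only: Stride and NumRecords stand for the
buffer-descriptor fields, and mayTriggerSubwordLoadBug is not a helper
added by this patch):

    static bool mayTriggerSubwordLoadBug(unsigned Stride, unsigned NumRecords) {
      // Stride not a multiple of 4, or a zero stride with misaligned num-records.
      return Stride % 4 != 0 || (Stride == 0 && NumRecords % 4 != 0);
    }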

Solves SWDEV-498239
---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |  13 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  81 ++--
 .../AMDGPU/gfx12_scalar_subword_loads.ll      | 396 +++++++++++++-----
 4 files changed, 356 insertions(+), 136 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index ff2af95ad08b43..19757cfdc66c3d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -353,6 +353,19 @@ void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
   }
 }
 
+bool GCNSubtarget::hasScalarSubwordBufferLoads() const {
+  Generation Gen = getGeneration();
+
+  // On gfx12, s_buffer_load_(i/u)(8/16) have a hw-bug that is triggered when:
+  // * the stride is not a multiple of 4, or
+  // * the stride is 0 and the num-records is not a multiple of 4
+  // Avoid these instructions unless the frontend explicitly specifies that the
+  // input buffers are known to not trigger the bug.
+  if (Gen == GFX12)
+    return UseGFX12SubwordSBufferLoad;
+  return hasScalarSubwordLoads();
+}
+
 bool GCNSubtarget::hasMadF16() const {
   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ea5e159fdd8363..e0b0b26b5adea0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -466,6 +466,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
 
+  bool hasScalarSubwordBufferLoads() const;
+
   TrapHandlerAbi getTrapHandlerAbi() const {
     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f326416a324178..8f4b1c35174e7f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6438,7 +6438,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
               MachineMemOperand::MOInvariant,
           VT.getStoreSize(), Alignment);
       SDValue LoadVal;
-      if (!Offset->isDivergent()) {
+      if (!Offset->isDivergent() && Subtarget->hasScalarSubwordBufferLoads()) {
         SDValue Ops[] = {Rsrc, // source register
                          Offset, CachePolicy};
         SDValue BufferLoad =
@@ -8367,52 +8367,57 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
           MachineMemOperand::MOInvariant,
       VT.getStoreSize(), Alignment);
 
-  if (!Offset->isDivergent()) {
-    SDValue Ops[] = {Rsrc, Offset, CachePolicy};
-
-    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
-    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
-    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
-    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
-    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
-      SDValue BufferLoad =
-          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
-                                  DAG.getVTList(MVT::i32), Ops, VT, MMO);
+  // Operands for a MUBUF buffer load, used when the offset is divergent or
+  // scalar subword buffer loads must be avoided. Assume an unswizzled buffer.
+  SDValue BufferLoadOps[] = {
+      DAG.getEntryNode(),                    // Chain
+      Rsrc,                                  // rsrc
+      DAG.getConstant(0, DL, MVT::i32),      // vindex
+      {},                                    // voffset
+      {},                                    // soffset
+      {},                                    // offset
+      CachePolicy,                           // cachepolicy
+      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+  };
+
+  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+    if (!Offset->isDivergent() && Subtarget->hasScalarSubwordBufferLoads()) {
+      // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
+      // s_buffer_load_u16 instruction is emitted for both signed and unsigned
+      // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
+      // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
+      SDValue SBufferLoadOps[] = {Rsrc, Offset, CachePolicy};
+      SDValue BufferLoad = DAG.getMemIntrinsicNode(
+          AMDGPUISD::SBUFFER_LOAD_USHORT, DL, DAG.getVTList(MVT::i32),
+          SBufferLoadOps, VT, MMO);
       return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
     }
 
+    // If s_buffer_load_u16/u8 is not supported by the platform (gfx12, when we
+    // cannot ensure the buffer's num-records/stride is properly aligned),
+    // lower to a buffer_load_u8/u16 instead.
+    setBufferOffsets(Offset, DAG, &BufferLoadOps[3], Align(4));
+    return handleByteShortBufferLoads(DAG, VT, DL, BufferLoadOps, MMO);
+  }
+
+  if (!Offset->isDivergent()) {
+    SDValue SBufferLoadOps[] = {Rsrc, Offset, CachePolicy};
+
     // Widen vec3 load to vec4.
     if (VT.isVector() && VT.getVectorNumElements() == 3 &&
         !Subtarget->hasScalarDwordx3Loads()) {
       EVT WidenedVT =
           EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
       auto WidenedOp = DAG.getMemIntrinsicNode(
-          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
-          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
+          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), SBufferLoadOps,
+          WidenedVT, MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
       auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
                                    DAG.getVectorIdxConstant(0, DL));
       return Subvector;
     }
 
     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   DAG.getVTList(VT), Ops, VT, MMO);
-  }
-
-  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
-  // assume that the buffer is unswizzled.
-  SDValue Ops[] = {
-      DAG.getEntryNode(),                    // Chain
-      Rsrc,                                  // rsrc
-      DAG.getConstant(0, DL, MVT::i32),      // vindex
-      {},                                    // voffset
-      {},                                    // soffset
-      {},                                    // offset
-      CachePolicy,                           // cachepolicy
-      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
-  };
-  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
-    setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
-    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+                                   DAG.getVTList(VT), SBufferLoadOps, VT, MMO);
   }
 
   SmallVector<SDValue, 4> Loads;
@@ -8431,14 +8436,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
 
   // Use the alignment to ensure that the required offsets will fit into the
   // immediate offsets.
-  setBufferOffsets(Offset, DAG, &Ops[3],
+  setBufferOffsets(Offset, DAG, &BufferLoadOps[3],
                    NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
 
-  uint64_t InstOffset = Ops[5]->getAsZExtVal();
+  uint64_t InstOffset = BufferLoadOps[5]->getAsZExtVal();
   for (unsigned i = 0; i < NumLoads; ++i) {
-    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
-    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
-                                        LoadVT, MMO, DAG));
+    BufferLoadOps[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
+    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                        BufferLoadOps, LoadVT, MMO, DAG));
   }
 
   if (NumElts == 8 || NumElts == 16)
@@ -12680,7 +12685,7 @@ SITargetLowering::performSignExtendInRegCombine(SDNode *N,
         VTSign->getVT() == MVT::i8) ||
        (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
         VTSign->getVT() == MVT::i16))) {
-    assert(Subtarget->hasScalarSubwordLoads() &&
+    assert(Subtarget->hasScalarSubwordBufferLoads() &&
            "s_buffer_load_{u8, i8} are supported "
            "in GFX12 (or newer) architectures.");
     EVT VT = Src.getValueType();
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 921bc23d17deb1..ae3eb6065363c2 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG,DAG-DEFAULT %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG,DAG-SBUFFER %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL %s
 
 define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
@@ -421,13 +421,28 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
 }
 
 define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_byte_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_i8 s0, s[0:3], 0x4
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_byte_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    buffer_load_i8 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_byte_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_i8 s0, s[0:3], 0x4
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_byte_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
   %sext = sext i8 %ld to i32
@@ -436,13 +451,29 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_byte_sgpr:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_byte_sgpr:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_byte_sgpr:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i8 %ld to i32
@@ -451,13 +482,29 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0)
@@ -481,15 +528,32 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_ubyte_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_u8 s0, s[0:3], 0x4
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_and_b32 s0, s0, 0xff
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ubyte_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    buffer_load_u8 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ubyte_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_u8 s0, s[0:3], 0x4
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_and_b32 s0, s0, 0xff
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
+; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
   %zext = zext i8 %ld to i32
@@ -498,15 +562,33 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_ubyte_sgpr:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_and_b32 s0, s0, 0xff
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ubyte_sgpr:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ubyte_sgpr:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_and_b32 s0, s0, 0xff
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
+; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i8 %ld to i32
@@ -515,15 +597,33 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_and_b32 s0, s0, 0xff
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_and_b32 s0, s0, 0xff
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
+; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
@@ -547,13 +647,28 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_short_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_i16 s0, s[0:3], 0x4
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_short_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    buffer_load_i16 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_short_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_i16 s0, s[0:3], 0x4
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_short_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
   %sext = sext i16 %ld to i32
@@ -562,13 +677,29 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_short_sgpr:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_short_sgpr:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_short_sgpr:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i16 %ld to i32
@@ -577,13 +708,29 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %off, i32 0)
@@ -607,15 +754,32 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
-; GCN-LABEL: s_buffer_load_ushort_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_u16 s0, s[0:3], 0x4
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ushort_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    buffer_load_u16 v2, off, s[0:3], null offset:4
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ushort_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_u16 s0, s[0:3], 0x4
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ushort_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], 0x4
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
+; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
   %zext = zext i16 %ld to i32
@@ -624,15 +788,33 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
-; GCN-LABEL: s_buffer_load_ushort_sgpr:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ushort_sgpr:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ushort_sgpr:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
+; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i16 %ld to i32
@@ -641,15 +823,33 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
-; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_endpgm
+; DAG-DEFAULT-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; DAG-DEFAULT:       ; %bb.0: ; %main_body
+; DAG-DEFAULT-NEXT:    v_mov_b32_e32 v2, s4
+; DAG-DEFAULT-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen offset:100
+; DAG-DEFAULT-NEXT:    s_wait_loadcnt 0x0
+; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-DEFAULT-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
+;
+; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; DAG-SBUFFER:       ; %bb.0: ; %main_body
+; DAG-SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
+; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
+; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %off, i32 0)

>From d875c5e7d223c3ca431dfa00c068ef3d9ea47b98 Mon Sep 17 00:00:00 2001
From: Juan Manuel Martinez Caamaño <juamarti at amd.com>
Date: Mon, 25 Nov 2024 14:54:40 +0100
Subject: [PATCH 4/4] [AMDGPU][GISEL][GFX12] Restrict scalar subword loads on
 GFX12

On gfx12, the s_buffer_load_(i/u)(8/16) instructions have a hardware bug
that is triggered when:
* the stride is not a multiple of 4, or
* the stride is 0 and the num-records is not a multiple of 4

For Vulkan and DX, it is guaranteed that the buffer's stride/num-records
are aligned to 4.

This patch prevents the emission of scalar subword loads unless an option
forcing them is passed to the backend.
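
At the IR level, this GlobalISel fallback is equivalent to rewriting the
scalar intrinsic into its raw-buffer counterpart with a zero soffset
inserted (a sketch of the transformation, not literal compiler output):

    ; before
    %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
    ; after: operands are rsrc, voffset, soffset (the inserted zero), cachepolicy
    %ld = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0, i32 0)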

Solves SWDEV-498239
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  36 ++-
 .../AMDGPU/gfx12_scalar_subword_loads.ll      | 304 ++++++------------
 2 files changed, 124 insertions(+), 216 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9bf1f281c32a09..f6c308d6e523ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6803,13 +6803,38 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
   unsigned Size = Ty.getSizeInBits();
   MachineFunction &MF = B.getMF();
   unsigned Opc = 0;
+
+  const unsigned MemSize = (Size + 7) / 8;
+  const Align MemAlign = B.getDataLayout().getABITypeAlign(
+      getTypeForLLT(Ty, MF.getFunction().getContext()));
+
+  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      MemSize, MemAlign);
+
   if (Size < 32 && ST.hasScalarSubwordLoads()) {
     assert(Size == 8 || Size == 16);
+    if (!ST.hasScalarSubwordBufferLoads()) {
+      // Fall back to a raw buffer load (selected as buffer_load_u8/u16).
+      MI.getOperand(1).setIntrinsicID(Intrinsic::amdgcn_raw_buffer_load);
+
+      Register Zero = B.buildConstant(S32, 0).getReg(0);
+      MI.insert(MI.operands_begin() + 4,
+                MachineOperand::CreateReg(Zero, false));
+
+      MI.addMemOperand(MF, MMO);
+      Observer.changedInstr(MI);
+      return true;
+    }
+
     Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                     : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
     // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
     // destination register.
-    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
+    Dst = B.getMRI()->createGenericVirtualRegister(S32);
   } else {
     Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
     Dst = OrigDst;
@@ -6834,15 +6859,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
   MI.setDesc(B.getTII().get(Opc));
   MI.removeOperand(1); // Remove intrinsic ID
 
-  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
-  const unsigned MemSize = (Size + 7) / 8;
-  const Align MemAlign = B.getDataLayout().getABITypeAlign(
-      getTypeForLLT(Ty, MF.getFunction().getContext()));
-  MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-          MachineMemOperand::MOInvariant,
-      MemSize, MemAlign);
   MI.addMemOperand(MF, MMO);
   if (Dst != OrigDst) {
     MI.getOperand(0).setReg(Dst);
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index ae3eb6065363c2..5dc709d3be83dc 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG,DAG-DEFAULT %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG,DAG-SBUFFER %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,DAG,SBUFFER %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 -amdgpu-use-gfx12-subword-sbuffer-load < %s | FileCheck -check-prefixes=GCN,GISEL,SBUFFER %s
 
 define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_s_load_i8:
@@ -428,21 +428,13 @@ define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr a
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_byte_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_i8 s0, s[0:3], 0x4
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_byte_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_byte_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], 0x4
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
   %sext = sext i8 %ld to i32
@@ -459,21 +451,13 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspa
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_byte_sgpr:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_byte_sgpr:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i8 %ld to i32
@@ -490,21 +474,13 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %sr
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0)
@@ -535,25 +511,15 @@ define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_ubyte_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_u8 s0, s[0:3], 0x4
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xff
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
-; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ubyte_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], 0x4
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
+; SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
   %zext = zext i8 %ld to i32
@@ -570,25 +536,15 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrsp
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xff
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
-; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ubyte_sgpr:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
+; SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i8 %ld to i32
@@ -605,25 +561,15 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %s
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xff
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
-; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    s_and_b32 s0, s0, 0xff
+; SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
@@ -654,21 +600,13 @@ define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_short_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_i16 s0, s[0:3], 0x4
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_short_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_short_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], 0x4
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
   %sext = sext i16 %ld to i32
@@ -685,21 +623,13 @@ define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrsp
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_short_sgpr:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_short_sgpr:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i16 %ld to i32
@@ -716,21 +646,13 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %s
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %off, i32 0)
@@ -761,25 +683,15 @@ define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_ushort_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_u16 s0, s[0:3], 0x4
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ushort_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], 0x4
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
-; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ushort_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], 0x4
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
+; SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
   %zext = zext i16 %ld to i32
@@ -796,25 +708,15 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrs
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
-; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ushort_sgpr:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
+; SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i16 %ld to i32
@@ -831,25 +733,15 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %
 ; DAG-DEFAULT-NEXT:    global_store_b32 v[0:1], v2, off
 ; DAG-DEFAULT-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
-; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
-;
-; DAG-SBUFFER-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
-; DAG-SBUFFER:       ; %bb.0: ; %main_body
-; DAG-SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
-; DAG-SBUFFER-NEXT:    s_wait_kmcnt 0x0
-; DAG-SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
-; DAG-SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; DAG-SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
-; DAG-SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-SBUFFER-NEXT:    s_endpgm
+; SBUFFER-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; SBUFFER:       ; %bb.0: ; %main_body
+; SBUFFER-NEXT:    s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
+; SBUFFER-NEXT:    s_wait_kmcnt 0x0
+; SBUFFER-NEXT:    s_and_b32 s0, s0, 0xffff
+; SBUFFER-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SBUFFER-NEXT:    v_mov_b32_e32 v2, s0
+; SBUFFER-NEXT:    global_store_b32 v[0:1], v2, off
+; SBUFFER-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %off, i32 0)


