[llvm] [AMDGPU] Fix negative immediate offset for unbuffered smem loads (PR #79553)

Thu Mar 21 16:03:56 PDT 2024

https://github.com/vangthao95 updated https://github.com/llvm/llvm-project/pull/79553

>From 48b758ade80749278c1df6be280f46fafe359503 Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Fri, 26 Jan 2024 06:50:16 +0000
Subject: [PATCH 1/4] [AMDGPU] Fix negative immediate offset for unbuffered
 smem loads

For unbuffered smem loads, it is illegal and undefined for the immediate offset
to be negative if the resulting IOFFSET + (SGPR[Offset] or M0 or zero) is
negative. As a workaround for this issue, if there is no SGPR[Offset] and the
immediate offset is negative, subtract the absolute value of the immediate
offset from the base address, then set the immediate offset to 0.
---
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td         |   4 +
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  49 ++++-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  14 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  35 ++-
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   8 +-
 llvm/lib/Target/AMDGPU/SMInstructions.td      |   3 +-
 .../GlobalISel/inst-select-load-constant.mir  | 203 +++++++++++++++++-
 .../AMDGPU/cgp-addressing-modes-smem.ll       |   3 +-
 .../AMDGPU/gfx12_scalar_subword_loads.ll      |   6 +-
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll |  21 +-
 10 files changed, 319 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 152f495a452ba2..0017f51ee5d925 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -116,6 +116,10 @@ def gi_smrd_sgpr_imm :
     GIComplexOperandMatcher<s64, "selectSmrdSgprImm">,
     GIComplexPatternEquiv<SMRDSgprImm>;
 
+def gi_smrd_prefetch_imm :
+    GIComplexOperandMatcher<s64, "selectSmrdPrefetchImm">,
+    GIComplexPatternEquiv<SMRDPrefetchImm>;
+
 def gi_flat_offset :
     GIComplexOperandMatcher<s64, "selectFlatOffset">,
     GIComplexPatternEquiv<FlatOffset>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5278b552a65514..73374c6f9eddd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2071,13 +2071,16 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
 // true, match only 32-bit immediate offsets available on CI.
 bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
                                               SDValue *SOffset, SDValue *Offset,
-                                              bool Imm32Only,
-                                              bool IsBuffer) const {
+                                              bool Imm32Only, bool IsBuffer,
+                                              bool IsPrefetch,
+                                              bool HasSOffset) const {
   if (SOffset && Offset) {
     assert(!Imm32Only && !IsBuffer);
     SDValue B;
-    return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
-           SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
+    return SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false,
+                                IsPrefetch, true) &&
+           SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false,
+                                IsPrefetch, true);
   }
 
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2096,12 +2099,39 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
   }
   if (!N0 || !N1)
     return false;
+
+  bool Selected = false;
   if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
     SBase = N0;
-    return true;
+    Selected = true;
   }
+
   if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
     SBase = N1;
+    Selected = true;
+  }
+
+  if (Selected) {
+    // For unbuffered smem loads, it is illegal and undefined for the Immediate
+    // Offset to be negative if the resulting (Offset + (M0 or SOffset or zero)
+    // is negative. Handle the case where the Immediate Offset is negative and
+    // there is no SOffset.
+    //
+    // FIXME: Also handle M0 or SOffset case?
+    if (Offset && !HasSOffset && !IsBuffer && !IsPrefetch &&
+        Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11) {
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) {
+        if (C->getSExtValue() < 0) {
+          SDLoc SL(SBase);
+          *Offset = CurDAG->getTargetConstant(std::abs(C->getSExtValue()), SL,
+                                              MVT::i32);
+          const SDValue Ops[] = {SBase, *Offset};
+          SBase = SDValue(
+              CurDAG->getMachineNode(AMDGPU::S_SUB_U64, SL, MVT::i64, Ops), 0);
+          *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+        }
+      }
+    }
     return true;
   }
   return false;
@@ -2109,8 +2139,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
 
 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                     SDValue *SOffset, SDValue *Offset,
-                                    bool Imm32Only) const {
-  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
+                                    bool Imm32Only, bool IsPrefetch) const {
+  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only, IsPrefetch)) {
     SBase = Expand32BitAddress(SBase);
     return true;
   }
@@ -2169,6 +2199,11 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                               /* IsBuffer */ true);
 }
 
+bool AMDGPUDAGToDAGISel::SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
+                                       SDValue &Offset) const {
+  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, false, true);
+}
+
 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                             SDValue &Base,
                                             SDValue &Offset) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 3b42d88df0c246..5328ba985474dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -194,11 +194,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                         SDValue *Offset, bool Imm32Only = false,
                         bool IsBuffer = false) const;
   SDValue Expand32BitAddress(SDValue Addr) const;
-  bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
-                            SDValue *Offset, bool Imm32Only = false,
-                            bool IsBuffer = false) const;
-  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
-                  SDValue *Offset, bool Imm32Only = false) const;
+  bool SelectSMRDBaseOffset(SDValue Addr, SDValue & SBase, SDValue * SOffset,
+                            SDValue * Offset, bool Imm32Only = false,
+                            bool IsBuffer = false, bool IsPrefetch = false,
+                            bool HasSOffset = false) const;
+  bool SelectSMRD(SDValue Addr, SDValue & SBase, SDValue * SOffset,
+                  SDValue * Offset, bool Imm32Only = false,
+                  bool IsPrefetch = false) const;
   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
@@ -208,6 +210,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
   bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                SDValue &Offset) const;
+  bool SelectSMRDPrefetchImm(SDValue Addr, SDValue & SBase, SDValue & Offset)
+      const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
 
   bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f255d098b631c7..c7cc701c63dc54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4221,7 +4221,8 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                  Register &Base,
                                                  Register *SOffset,
-                                                 int64_t *Offset) const {
+                                                 int64_t *Offset,
+                                                 bool IsPrefetch) const {
   MachineInstr *MI = Root.getParent();
   MachineBasicBlock *MBB = MI->getParent();
 
@@ -4257,6 +4258,27 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
   if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
     Base = GEPI.SgprParts[0];
     *Offset = *EncodedImm;
+    // For unbuffered smem loads, it is illegal and undefined for the Immediate
+    // Offset to be negative if the resulting (Offset + (M0 or SOffset or zero)
+    // is negative. Handle the case where the Immediate Offset is negative and
+    // there is no SOffset.
+    //
+    // FIXME: Also handle M0 or SOffset case?
+    if (!IsPrefetch && *Offset < 0 &&
+        STI.getGeneration() >= AMDGPUSubtarget::GFX11) {
+      // Subtract the absolute value of the offset from the base register and
+      // set the immediate offset to 0.
+      Register SubtractReg =
+          MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U64),
+              SubtractReg)
+          .addReg(Base)
+          .addImm(std::abs(*Offset));
+      Base = SubtractReg;
+      *Offset = 0;
+    }
+
     return true;
   }
 
@@ -4339,6 +4361,17 @@ AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
 }
 
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdPrefetchImm(MachineOperand &Root) const {
+  Register Base;
+  int64_t Offset;
+  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset, true))
+    return std::nullopt;
+
+  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
+}
+
 std::pair<Register, int>
 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
                                                 uint64_t FlatVariant) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index ef7630f137aca6..573bc9260c765b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -220,8 +220,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   InstructionSelector::ComplexRendererFns
   selectVINTERPModsHi(MachineOperand &Root) const;
 
-  bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
-                        int64_t *Offset) const;
+  bool selectSmrdOffset(MachineOperand & Root, Register & Base,
+                        Register * SOffset, int64_t * Offset,
+                        bool IsPrefetch = false) const;
+
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
@@ -230,6 +232,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   selectSmrdSgpr(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
   selectSmrdSgprImm(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectSmrdPrefetchImm(MachineOperand &Root) const;
 
   std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root,
                                                 uint64_t FlatVariant) const;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index f3096962e2f3e8..33127ac281f677 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -859,6 +859,7 @@ def SMRDSgprImm     : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
 def SMRDBufferImm   : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
+def SMRDPrefetchImm : ComplexPattern<iPTR, 2, "SelectSMRDPrefetchImm">;
 
 multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
 
@@ -1078,7 +1079,7 @@ def i32imm_one : TImmLeaf <i32, [{
 
 multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
   def : GCNPat <
-    (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, cache_type),
+    (smrd_prefetch (SMRDPrefetchImm i64:$sbase, i32:$offset), timm, timm, cache_type),
     (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))
   >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index c44477273dad09..b7010e4c65beb9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -3,7 +3,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
 
 ---
 
@@ -44,6 +44,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -89,6 +96,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s16_from_4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -133,6 +147,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s32
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -176,6 +197,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s32_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -219,6 +247,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v4s16_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -263,6 +298,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v4s32_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x  s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -307,6 +349,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s64
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -351,6 +400,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s64_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -395,6 +451,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s64
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -439,6 +502,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
+    ; GFX11-LABEL: name: load_constant_v2p1
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -483,6 +553,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128)
+    ;
+    ; GFX11-LABEL: name: load_constant_s128_align4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -527,6 +604,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_p3_from_4
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -571,6 +655,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_p4_from_8
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p4) = G_LOAD %0 :: (load (p4), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -615,6 +706,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999)
+    ;
+    ; GFX11-LABEL: name: load_constant_p999_from_8
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -659,6 +757,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
+    ; GFX11-LABEL: name: load_constant_v2p3
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -703,6 +808,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v2s16
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -747,6 +859,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v4s16
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -791,6 +910,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v8s16
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -835,6 +961,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v8s32
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
@@ -879,6 +1012,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v16s32
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -923,6 +1063,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_v8s64
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -971,6 +1118,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1020
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1020
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1018,6 +1172,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1024
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1024
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1067,6 +1228,14 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048575
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575
+    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1048575
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1116,6 +1285,14 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048576
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
+    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1048576
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1166,6 +1343,14 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1073741823
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823
+    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1073741823
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1236,6 +1421,14 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_1
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_SUB_U64_:%[0-9]+]]:sreg_64 = S_SUB_U64 [[COPY]], 1
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U64_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -1
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1306,6 +1499,14 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
+    ;
+    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_524288
+    ; GFX11: liveins: $sgpr0_sgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX11-NEXT: [[S_SUB_U64_:%[0-9]+]]:sreg_64 = S_SUB_U64 [[COPY]], 524288
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U64_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -524288
     %2:sgpr(p4) = G_PTR_ADD %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index 54dc5b8b9d3dd6..a76216fc8f6a32 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -307,10 +307,11 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ;
 ; GFX12-LABEL: test_sink_smem_offset_neg400:
 ; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], 0x190
 ; GFX12-NEXT:  .LBB5_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], -0x190
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 6c324ddc654667..2ae9524d2a842e 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -21,7 +21,8 @@ define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(
 define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_s_load_i8_imm:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_i8 s0, s[0:1], -0x64
+; GCN-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], 0x64
+; GCN-NEXT:    s_load_i8 s0, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
@@ -197,7 +198,8 @@ define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace
 define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_s_load_i16_imm:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_i16 s0, s[0:1], -0xc8
+; GCN-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], 0xc8
+; GCN-NEXT:    s_load_i16 s0, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 6d99485b91fe48..fa51dc374be230 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -159,7 +159,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr
 ;
 ; GFX12-LABEL: global_load_saddr_i8_offset_neg4096:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x1000
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 0x1000
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -200,7 +201,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr
 ;
 ; GFX12-LABEL: global_load_saddr_i8_offset_neg4097:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x1001
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 0x1001
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -241,7 +243,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr
 ;
 ; GFX12-LABEL: global_load_saddr_i8_offset_neg4098:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x1002
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 0x1002
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -378,7 +381,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr
 ;
 ; GFX12-LABEL: global_load_saddr_i8_offset_neg2048:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x800
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 0x800
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -415,7 +419,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr
 ;
 ; GFX12-LABEL: global_load_saddr_i8_offset_neg2049:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x801
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 0x801
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -452,7 +457,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr
 ;
 ; GFX12-LABEL: global_load_saddr_i8_offset_neg2050:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x802
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 0x802
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -527,7 +533,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) in
 ;
 ; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x800000
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 0x800000
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog

>From 8194ef18f3b4a3898f4ec1791a5b81a4f70f35ec Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Fri, 2 Feb 2024 23:01:27 +0000
Subject: [PATCH 2/4] Fix formatting issues and change if statements in
 SelectSMRDBaseOffset() to if-else if.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      |  9 ++++-----
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h        | 12 ++++++------
 llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h |  5 ++---
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 73374c6f9eddd4..ffa2f42a9e7a91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2104,9 +2104,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
   if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
     SBase = N0;
     Selected = true;
-  }
-
-  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
+  } else if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
     SBase = N1;
     Selected = true;
   }
@@ -2140,7 +2138,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                     SDValue *SOffset, SDValue *Offset,
                                     bool Imm32Only, bool IsPrefetch) const {
-  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only, IsPrefetch)) {
+  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only,
+                           IsPrefetch)) {
     SBase = Expand32BitAddress(SBase);
     return true;
   }
@@ -2200,7 +2199,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
-                                       SDValue &Offset) const {
+                                               SDValue &Offset) const {
   return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, false, true);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5328ba985474dd..24bb9fd905af52 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -194,12 +194,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                         SDValue *Offset, bool Imm32Only = false,
                         bool IsBuffer = false) const;
   SDValue Expand32BitAddress(SDValue Addr) const;
-  bool SelectSMRDBaseOffset(SDValue Addr, SDValue & SBase, SDValue * SOffset,
-                            SDValue * Offset, bool Imm32Only = false,
+  bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
+                            SDValue *Offset, bool Imm32Only = false,
                             bool IsBuffer = false, bool IsPrefetch = false,
                             bool HasSOffset = false) const;
-  bool SelectSMRD(SDValue Addr, SDValue & SBase, SDValue * SOffset,
-                  SDValue * Offset, bool Imm32Only = false,
+  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
+                  SDValue *Offset, bool Imm32Only = false,
                   bool IsPrefetch = false) const;
   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -210,8 +210,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
   bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                SDValue &Offset) const;
-  bool SelectSMRDPrefetchImm(SDValue Addr, SDValue & SBase, SDValue & Offset)
-      const;
+  bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
+                             SDValue &Offset) const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
 
   bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 573bc9260c765b..473068e7f6ac99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -220,9 +220,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   InstructionSelector::ComplexRendererFns
   selectVINTERPModsHi(MachineOperand &Root) const;
 
-  bool selectSmrdOffset(MachineOperand & Root, Register & Base,
-                        Register * SOffset, int64_t * Offset,
-                        bool IsPrefetch = false) const;
+  bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
+                        int64_t *Offset, bool IsPrefetch = false) const;
 
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;

>From d7b3bb5631b13ac06c46bac883b9bf823acee5b7 Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Mon, 12 Feb 2024 19:41:28 +0000
Subject: [PATCH 3/4] Add check for signed imm offset support and change
 generation check to gfx9+

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp     | 15 +++++++++++----
 llvm/lib/Target/AMDGPU/GCNSubtarget.h             |  4 ++++
 .../CodeGen/AMDGPU/cgp-addressing-modes-smem.ll   |  6 ++++--
 llvm/test/CodeGen/AMDGPU/smrd.ll                  |  4 ++--
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ffa2f42a9e7a91..dcb6c892713948 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2117,15 +2117,21 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
     //
     // FIXME: Also handle M0 or SOffset case?
     if (Offset && !HasSOffset && !IsBuffer && !IsPrefetch &&
-        Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11) {
+        Subtarget->hasSignedSMRDImmOffset()) {
       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) {
         if (C->getSExtValue() < 0) {
           SDLoc SL(SBase);
           *Offset = CurDAG->getTargetConstant(std::abs(C->getSExtValue()), SL,
                                               MVT::i32);
           const SDValue Ops[] = {SBase, *Offset};
-          SBase = SDValue(
-              CurDAG->getMachineNode(AMDGPU::S_SUB_U64, SL, MVT::i64, Ops), 0);
+          if (Subtarget->hasScalarAddSub64())
+            SBase = SDValue(
+                CurDAG->getMachineNode(AMDGPU::S_SUB_U64, SL, MVT::i64, Ops),
+                0);
+          else
+            SBase = SDValue(CurDAG->getMachineNode(AMDGPU::S_SUB_U64_PSEUDO, SL,
+                                                   MVT::i64, Ops),
+                            0);
           *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
         }
       }
@@ -2200,7 +2206,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
 
 bool AMDGPUDAGToDAGISel::SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
                                                SDValue &Offset) const {
-  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, false, true);
+  return SelectSMRD(Addr, SBase, /*SOffset=*/ nullptr, &Offset,
+                    /*Imm32Only=*/ false, /*IsPrefetch=*/ true);
 }
 
 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4f8eeaaf500b4d..aa4c30b32d8408 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1290,6 +1290,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // of sign-extending.
   bool hasGetPCZeroExtension() const { return GFX12Insts; }
 
+  /// \returns true if the target supports signed immediate offset for SMRD
+  /// instructions.
+  bool hasSignedSMRDImmOffset() const { return getGeneration() >= GFX9; }
+
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index a76216fc8f6a32..abcac52d135fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -297,9 +297,11 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:  .LBB5_1: ; %loop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s3, s[0:1], -0x190
 ; GFX9-NEXT:    s_add_i32 s2, s2, -1
+; GFX9-NEXT:    s_sub_u32 s4, s0, 0x190
+; GFX9-NEXT:    s_subb_u32 s5, s1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %end
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 4ce9260b8d53de..ba8ebb44b9c0d5 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -88,11 +88,11 @@ entry:
   ret void
 }
 
-; GFX9_10 can use a signed immediate byte offset
+; GFX9_10 can use a signed immediate byte offset but not without sgpr[offset]
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
-; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, -0x4
+; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
   %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1

>From 6f0b4dd55b43fa7c1151bbdc745e4f896aa7fddf Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Thu, 21 Mar 2024 18:48:59 -0400
Subject: [PATCH 4/4] Remove smrd prefetch changes and use computeKnownBits()
 for sgpr[offset].

---
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td         |   4 -
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 122 ++++++++++--------
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  13 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  67 +++++-----
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   6 +-
 llvm/lib/Target/AMDGPU/SMInstructions.td      |   3 +-
 .../GlobalISel/inst-select-load-constant.mir  |  14 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll   |   3 +-
 .../AMDGPU/gfx12_scalar_subword_loads.ll      |  80 ++++++++++++
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll |   3 +-
 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll     |   6 +-
 llvm/test/CodeGen/AMDGPU/smrd.ll              |   2 +-
 12 files changed, 213 insertions(+), 110 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 0017f51ee5d925..152f495a452ba2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -116,10 +116,6 @@ def gi_smrd_sgpr_imm :
     GIComplexOperandMatcher<s64, "selectSmrdSgprImm">,
     GIComplexPatternEquiv<SMRDSgprImm>;
 
-def gi_smrd_prefetch_imm :
-    GIComplexOperandMatcher<s64, "selectSmrdPrefetchImm">,
-    GIComplexPatternEquiv<SMRDPrefetchImm>;
-
 def gi_flat_offset :
     GIComplexOperandMatcher<s64, "selectFlatOffset">,
     GIComplexPatternEquiv<FlatOffset>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index dcb6c892713948..190e4655184b44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1979,12 +1979,37 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
   return true;
 }
 
+// Subtract the absolute value of the immediate offset from SBase and set the
+// immediate offset to 0.
+bool AMDGPUDAGToDAGISel::subtractOffsetFromBase(SDValue *SBase,
+                                                SDValue *Offset) const {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset);
+  if (!C)
+    return false;
+
+  *Offset = CurDAG->getTargetConstant(std::abs(C->getSExtValue()),
+                                      SDLoc(*Offset), MVT::i32);
+  const SDValue Ops[] = {*SBase, *Offset};
+  unsigned Opc;
+
+  if (Subtarget->hasScalarAddSub64())
+    Opc = AMDGPU::S_SUB_U64;
+  else
+    Opc = AMDGPU::S_SUB_U64_PSEUDO;
+  *SBase =
+      SDValue(CurDAG->getMachineNode(Opc, SDLoc(*SBase), MVT::i64, Ops), 0);
+  *Offset = CurDAG->getTargetConstant(0, SDLoc(*Offset), MVT::i32);
+  return true;
+}
+
 // Match an immediate (if Offset is not null) or an SGPR (if SOffset is
 // not null) offset. If Imm32Only is true, match only 32-bit immediate
 // offsets available on CI.
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
-                                          SDValue *SOffset, SDValue *Offset,
-                                          bool Imm32Only, bool IsBuffer) const {
+                                          SDValue *SBase, SDValue *SOffset,
+                                          SDValue *Offset, bool Imm32Only,
+                                          bool IsBuffer,
+                                          bool HasSOffset) const {
   assert((!SOffset || !Offset) &&
          "Cannot match both soffset and offset at the same time!");
 
@@ -2015,7 +2040,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
       AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
   if (EncodedOffset && Offset && !Imm32Only) {
     *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
-    return true;
+    if (EncodedOffset >= 0 || IsBuffer || HasSOffset ||
+        !Subtarget->hasSignedSMRDImmOffset())
+      return true;
+    // For unbuffered smem loads, it is illegal and undefined for the Immediate
+    // Offset to be negative if the resulting (Offset + (M0 or SOffset or zero))
+    // is negative. Handle the case where the Immediate Offset is negative and
+    // there is no SOffset.
+    return subtractOffsetFromBase(SBase, Offset);
   }
 
   // SGPR and literal offsets are unsigned.
@@ -2072,15 +2104,34 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
 bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
                                               SDValue *SOffset, SDValue *Offset,
                                               bool Imm32Only, bool IsBuffer,
-                                              bool IsPrefetch,
                                               bool HasSOffset) const {
   if (SOffset && Offset) {
     assert(!Imm32Only && !IsBuffer);
     SDValue B;
-    return SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false,
-                                IsPrefetch, true) &&
-           SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false,
-                                IsPrefetch, true);
+    if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+      return false;
+
+    if (!SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true))
+      return false;
+
+    if (IsBuffer || Imm32Only || !Subtarget->hasSignedSMRDImmOffset())
+      return true;
+
+    // For unbuffered smem loads, it is illegal and undefined for the Immediate
+    // Offset to be negative if the resulting (Offset + (M0 or SOffset or zero))
+    // is negative. Handle the case where the Immediate Offset + SOffset is
+    // negative.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) {
+      int64_t ByteOffset = C->getSExtValue();
+      if (ByteOffset >= 0)
+        return true;
+
+      KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
+      if (ByteOffset + SKnown.getMinValue().getSExtValue() < 0)
+        return subtractOffsetFromBase(&SBase, Offset);
+    }
+
+    return true;
   }
 
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2100,42 +2151,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
   if (!N0 || !N1)
     return false;
 
-  bool Selected = false;
-  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
+  if (SelectSMRDOffset(N1, &N0, SOffset, Offset, Imm32Only, IsBuffer,
+                       HasSOffset)) {
     SBase = N0;
-    Selected = true;
-  } else if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
-    SBase = N1;
-    Selected = true;
+    return true;
   }
-
-  if (Selected) {
-    // For unbuffered smem loads, it is illegal and undefined for the Immediate
-    // Offset to be negative if the resulting (Offset + (M0 or SOffset or zero)
-    // is negative. Handle the case where the Immediate Offset is negative and
-    // there is no SOffset.
-    //
-    // FIXME: Also handle M0 or SOffset case?
-    if (Offset && !HasSOffset && !IsBuffer && !IsPrefetch &&
-        Subtarget->hasSignedSMRDImmOffset()) {
-      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) {
-        if (C->getSExtValue() < 0) {
-          SDLoc SL(SBase);
-          *Offset = CurDAG->getTargetConstant(std::abs(C->getSExtValue()), SL,
-                                              MVT::i32);
-          const SDValue Ops[] = {SBase, *Offset};
-          if (Subtarget->hasScalarAddSub64())
-            SBase = SDValue(
-                CurDAG->getMachineNode(AMDGPU::S_SUB_U64, SL, MVT::i64, Ops),
-                0);
-          else
-            SBase = SDValue(CurDAG->getMachineNode(AMDGPU::S_SUB_U64_PSEUDO, SL,
-                                                   MVT::i64, Ops),
-                            0);
-          *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
-        }
-      }
-    }
+  if (SelectSMRDOffset(N0, &N1, SOffset, Offset, Imm32Only, IsBuffer,
+                       HasSOffset)) {
+    SBase = N1;
     return true;
   }
   return false;
@@ -2143,9 +2166,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
 
 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                     SDValue *SOffset, SDValue *Offset,
-                                    bool Imm32Only, bool IsPrefetch) const {
-  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only,
-                           IsPrefetch)) {
+                                    bool Imm32Only) const {
+  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
     SBase = Expand32BitAddress(SBase);
     return true;
   }
@@ -2183,14 +2205,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
-  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+  return SelectSMRDOffset(N, /*SBase=*/nullptr, /* SOffset */ nullptr, &Offset,
                           /* Imm32Only */ false, /* IsBuffer */ true);
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                                SDValue &Offset) const {
   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
-  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+  return SelectSMRDOffset(N, /*SBase=*/nullptr, /* SOffset */ nullptr, &Offset,
                           /* Imm32Only */ true, /* IsBuffer */ true);
 }
 
@@ -2204,12 +2226,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                               /* IsBuffer */ true);
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
-                                               SDValue &Offset) const {
-  return SelectSMRD(Addr, SBase, /*SOffset=*/ nullptr, &Offset,
-                    /*Imm32Only=*/ false, /*IsPrefetch=*/ true);
-}
-
 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                             SDValue &Base,
                                             SDValue &Offset) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 24bb9fd905af52..72b13aebf2e102 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -190,17 +190,17 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
                            SDValue &SAddr, SDValue &Offset) const;
 
-  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
-                        SDValue *Offset, bool Imm32Only = false,
-                        bool IsBuffer = false) const;
+  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SBase,
+                        SDValue *SOffset, SDValue *Offset,
+                        bool Imm32Only = false, bool IsBuffer = false,
+                        bool HasSOffset = false) const;
   SDValue Expand32BitAddress(SDValue Addr) const;
   bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
                             SDValue *Offset, bool Imm32Only = false,
-                            bool IsBuffer = false, bool IsPrefetch = false,
+                            bool IsBuffer = false,
                             bool HasSOffset = false) const;
   bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
-                  SDValue *Offset, bool Imm32Only = false,
-                  bool IsPrefetch = false) const;
+                  SDValue *Offset, bool Imm32Only = false) const;
   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
@@ -263,6 +263,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                                 SDValue &SrcMods) const;
   bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
 
+  bool subtractOffsetFromBase(SDValue *SBase, SDValue *Offset) const;
   SDValue getHi16Elt(SDValue In) const;
 
   SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c7cc701c63dc54..b0f1e1a1a52de8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4218,11 +4218,31 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
   }};
 }
 
+// Subtract the absolute value of the immediate offset from SBase and set the
+// immediate offset to 0.
+bool AMDGPUInstructionSelector::subtractOffsetFromBase(MachineInstr *MI,
+                                                       MachineBasicBlock *MBB,
+                                                       Register &Base,
+                                                       int64_t *Offset) const {
+  Register SubtractReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned Opc;
+
+  if (Subtarget->hasScalarAddSub64())
+    Opc = AMDGPU::S_SUB_U64;
+  else
+    Opc = AMDGPU::S_SUB_U64_PSEUDO;
+
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(Opc), SubtractReg)
+      .addReg(Base)
+      .addImm(std::abs(*Offset));
+  Base = SubtractReg;
+  *Offset = 0;
+  return true;
+}
 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                  Register &Base,
                                                  Register *SOffset,
-                                                 int64_t *Offset,
-                                                 bool IsPrefetch) const {
+                                                 int64_t *Offset) const {
   MachineInstr *MI = Root.getParent();
   MachineBasicBlock *MBB = MI->getParent();
 
@@ -4248,6 +4268,17 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
           Base = GEPI2.SgprParts[0];
           *SOffset = OffsetReg;
           *Offset = *EncodedImm;
+          if (*Offset >= 0 || !STI.hasSignedSMRDImmOffset())
+            return true;
+
+          // For unbuffered smem loads, it is illegal and undefined for the
+          // Immediate Offset to be negative if the resulting (Offset + (M0 or
+          // SOffset or zero) is negative. Handle the case where the Immediate
+          // Offset + SOffset is negative.
+          auto SKnown = KB->getKnownBits(*SOffset);
+          if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
+            return subtractOffsetFromBase(MI, MBB, Base, Offset);
+
           return true;
         }
       }
@@ -4258,28 +4289,13 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
   if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
     Base = GEPI.SgprParts[0];
     *Offset = *EncodedImm;
+    if (*Offset >= 0 || !STI.hasSignedSMRDImmOffset())
+      return true;
     // For unbuffered smem loads, it is illegal and undefined for the Immediate
     // Offset to be negative if the resulting (Offset + (M0 or SOffset or zero)
     // is negative. Handle the case where the Immediate Offset is negative and
     // there is no SOffset.
-    //
-    // FIXME: Also handle M0 or SOffset case?
-    if (!IsPrefetch && *Offset < 0 &&
-        STI.getGeneration() >= AMDGPUSubtarget::GFX11) {
-      // Subtract the absolute value of the offset from the base register and
-      // set the immediate offset to 0.
-      Register SubtractReg =
-          MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-
-      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U64),
-              SubtractReg)
-          .addReg(Base)
-          .addImm(std::abs(*Offset));
-      Base = SubtractReg;
-      *Offset = 0;
-    }
-
-    return true;
+    return subtractOffsetFromBase(MI, MBB, Base, Offset);
   }
 
   // SGPR offset is unsigned.
@@ -4361,17 +4377,6 @@ AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
 }
 
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectSmrdPrefetchImm(MachineOperand &Root) const {
-  Register Base;
-  int64_t Offset;
-  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset, true))
-    return std::nullopt;
-
-  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
-           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
-}
-
 std::pair<Register, int>
 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
                                                 uint64_t FlatVariant) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 473068e7f6ac99..6a776d4da50b8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -221,7 +221,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   selectVINTERPModsHi(MachineOperand &Root) const;
 
   bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
-                        int64_t *Offset, bool IsPrefetch = false) const;
+                        int64_t *Offset) const;
+  bool subtractOffsetFromBase(MachineInstr *MI, MachineBasicBlock *MBB,
+                              Register &Base, int64_t *Offset) const;
 
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;
@@ -231,8 +233,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   selectSmrdSgpr(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
   selectSmrdSgprImm(MachineOperand &Root) const;
-  InstructionSelector::ComplexRendererFns
-  selectSmrdPrefetchImm(MachineOperand &Root) const;
 
   std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root,
                                                 uint64_t FlatVariant) const;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 33127ac281f677..f3096962e2f3e8 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -859,7 +859,6 @@ def SMRDSgprImm     : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
 def SMRDBufferImm   : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
-def SMRDPrefetchImm : ComplexPattern<iPTR, 2, "SelectSMRDPrefetchImm">;
 
 multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
 
@@ -1079,7 +1078,7 @@ def i32imm_one : TImmLeaf <i32, [{
 
 multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
   def : GCNPat <
-    (smrd_prefetch (SMRDPrefetchImm i64:$sbase, i32:$offset), timm, timm, cache_type),
+    (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, cache_type),
     (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))
   >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index b7010e4c65beb9..9b18c6bccbbad7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -1419,15 +1419,16 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0 :: (load (s32), addrspace 4)
+    ; GFX10-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[COPY]], 1, implicit-def $scc
+    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     ;
     ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_1
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_SUB_U64_:%[0-9]+]]:sreg_64 = S_SUB_U64 [[COPY]], 1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U64_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[COPY]], 1, implicit-def $scc
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -1
@@ -1497,15 +1498,16 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0 :: (load (s32), addrspace 4)
+    ; GFX10-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[COPY]], 524288, implicit-def $scc
+    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     ;
     ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_524288
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_SUB_U64_:%[0-9]+]]:sreg_64 = S_SUB_U64 [[COPY]], 524288
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U64_]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX11-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[COPY]], 524288, implicit-def $scc
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_SUB_U]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -524288
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 139f82b3dc9f7a..f385fb12fce55f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -92,7 +92,8 @@ entry:
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
-; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], -0x4
+; GFX9_10: s_sub_u32 s{{[0-9]}}, s{{[0-9]}}, 4
+; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
   %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 2ae9524d2a842e..f8eec5a216aa01 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -440,6 +440,86 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
   ret void
 }
 
+define amdgpu_kernel void @test_s_load_no_neg_offset(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
+; DAG-LABEL: test_s_load_no_neg_offset:
+; DAG:       ; %bb.0: ; %entry
+; DAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; DAG-NEXT:    s_wait_kmcnt 0x0
+; DAG-NEXT:    s_sub_nc_u64 s[2:3], s[2:3], 4
+; DAG-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; DAG-NEXT:    s_wait_kmcnt 0x0
+; DAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; DAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; DAG-NEXT:    s_nop 0
+; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_s_load_no_neg_offset:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_sub_nc_u64 s[2:3], s[2:3], 4
+; GISEL-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
+entry:
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps float @test_s_load_sgpr_offset_with_neg_imm_offset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
+; GCN-LABEL: test_s_load_sgpr_offset_with_neg_imm_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_or_b32 s2, s2, 24
+; GCN-NEXT:    s_load_u8 s0, s[0:1], s2 offset:-0x18
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %1 = or i32 %soffset, 24
+  %zext.offset = zext i32 %1 to i64
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
+  %load = load i8, ptr addrspace(1) %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @test_s_load_no_neg_imm_offset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
+; DAG-LABEL: test_s_load_no_neg_imm_offset:
+; DAG:       ; %bb.0:
+; DAG-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], 24
+; DAG-NEXT:    s_or_b32 s2, s2, 16
+; DAG-NEXT:    s_load_u8 s0, s[0:1], s2 offset:0x0
+; DAG-NEXT:    s_wait_kmcnt 0x0
+; DAG-NEXT:    v_mov_b32_e32 v0, s0
+; DAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: test_s_load_no_neg_imm_offset:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_or_b32 s2, s2, 16
+; GISEL-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], 24
+; GISEL-NEXT:    s_load_u8 s0, s[0:1], s2 offset:0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    ; return to shader part epilog
+  %1 = or i32 %soffset, 16
+  %zext.offset = zext i32 %1 to i64
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
+  %load = load i8, ptr addrspace(1) %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
 define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
 ; GCN-LABEL: s_buffer_load_byte_imm_offset:
 ; GCN:       ; %bb.0: ; %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index fa51dc374be230..d2ac462e0cf577 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -1730,7 +1730,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a
 ;
 ; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], s4 offset:-0x18
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[2:3], 24
+; GFX12-NEXT:    s_load_u8 s0, s[0:1], s4 offset:0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index 77fd0bc058aca5..6671ed1bc5ae79 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -55,7 +55,8 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX12-LABEL: prefetch_data_sgpr_min_offset:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_data s[0:1], -0x800000, null, 0
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], 0x800000
+; GFX12-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: prefetch_data_sgpr_min_offset:
@@ -217,7 +218,8 @@ entry:
 define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX12-LABEL: prefetch_inst_sgpr_min_offset:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_inst s[0:1], -0x800000, null, 0
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], 0x800000
+; GFX12-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: prefetch_inst_sgpr_min_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index ba8ebb44b9c0d5..e2dfbb8fad8ede 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -88,7 +88,7 @@ entry:
   ret void
 }
 
-; GFX9_10 can use a signed immediate byte offset but not without sgpr[offset]
+; GFX9+ can use a signed immediate byte offset but not without sgpr[offset]
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0