[llvm] 07a6d73 - [AMDGPU] CodeGen for GFX12 VFLAT, VSCRATCH and VGLOBAL instructions (#75493)

via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 15 06:01:48 PST 2023


Author: Mirko Brkušanin
Date: 2023-12-15T15:01:40+01:00
New Revision: 07a6d73664c73ed01215079e00ddbad3ead70119

URL: https://github.com/llvm/llvm-project/commit/07a6d73664c73ed01215079e00ddbad3ead70119
DIFF: https://github.com/llvm/llvm-project/commit/07a6d73664c73ed01215079e00ddbad3ead70119.diff

LOG: [AMDGPU] CodeGen for GFX12 VFLAT, VSCRATCH and VGLOBAL instructions (#75493)

Added: 
    llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
    llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/FLATInstructions.td
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
    llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
    llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
    llvm/test/CodeGen/AMDGPU/offset-split-global.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 22e8c6086f087e..51bd9b63c127ed 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2486,6 +2486,11 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_flat_atomic_fmin_num   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmax_num   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//

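The four *_num intrinsics above reuse the AMDGPUGlobalAtomicRtn shape: an
overloaded pointer operand, a float data operand, and the prior memory
contents as the result. A minimal IR sketch of their use — the mangled
suffixes assume the same return/pointer/data overload convention as the
existing fmin/fmax intrinsics, and the function name is hypothetical:

    declare float @llvm.amdgcn.flat.atomic.fmin.num.f32.p0.f32(ptr, float)

    define float @fmin_num_example(ptr %ptr, float %data) {
      ; Atomically stores min(*%ptr, %data) and returns the old value.
      %old = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p0.f32(ptr %ptr, float %data)
      ret float %old
    }
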
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 98d90814f223c3..b0eac567ec9f18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1166,6 +1166,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
   if (isNoUnsignedWrap(Addr))
     return true;
 
+  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+  // values.
+  if (AMDGPU::isGFX12Plus(*Subtarget))
+    return true;
+
   auto LHS = Addr.getOperand(0);
   auto RHS = Addr.getOperand(1);
 
@@ -1701,7 +1706,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   }
 
   VAddr = Addr;
-  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
   return true;
 }
 
@@ -1769,7 +1774,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
               CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
           VOffset = SDValue(VMov, 0);
           SAddr = LHS;
-          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
           return true;
         }
       }
@@ -1809,7 +1814,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
     }
 
     if (SAddr) {
-      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
       return true;
     }
   }
@@ -1825,7 +1830,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
   VOffset = SDValue(VMov, 0);
-  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
   return true;
 }
 

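The Offset operand switches from MVT::i16 to MVT::i32 in these hunks because
the immediate offset field in the GFX12 VFLAT/VGLOBAL/VSCRATCH encodings is
wider than on earlier targets. Assuming a 24-bit signed field (what
AMDGPU::getNumFlatOffsetBits would report for GFX12), the encodable range is

    -2^23 .. 2^23 - 1  =  -8388608 .. 8388607

which no longer fits in a 16-bit target constant.
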
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 121026aca60355..eaf72d7157ee2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -642,6 +642,10 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
 defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
 defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
+defm int_amdgcn_flat_atomic_fmin_num : noret_op;
+defm int_amdgcn_flat_atomic_fmax_num : noret_op;
+defm int_amdgcn_global_atomic_fmin_num : noret_op;
+defm int_amdgcn_global_atomic_fmax_num : noret_op;
 
 multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
   let HasNoUse = true in

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d8a41ae1800e3e..66d0e807e02845 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4663,9 +4663,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_global_atomic_csub:
     case Intrinsic::amdgcn_global_atomic_fmin:
     case Intrinsic::amdgcn_global_atomic_fmax:
+    case Intrinsic::amdgcn_global_atomic_fmin_num:
+    case Intrinsic::amdgcn_global_atomic_fmax_num:
     case Intrinsic::amdgcn_flat_atomic_fadd:
     case Intrinsic::amdgcn_flat_atomic_fmin:
     case Intrinsic::amdgcn_flat_atomic_fmax:
+    case Intrinsic::amdgcn_flat_atomic_fmin_num:
+    case Intrinsic::amdgcn_flat_atomic_fmax_num:
     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
       return getDefaultMappingAllVGPR(MI);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index fc05f14744c0f9..beb670669581f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -241,9 +241,13 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_ds_fadd>;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d754058d4f0645..53a9fc2d8e7f66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1026,6 +1026,8 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
     OpIndexes.push_back(0);
     return true;
   default:
@@ -1100,7 +1102,9 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
-  case Intrinsic::amdgcn_flat_atomic_fmin: {
+  case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
     Type *DestTy = II->getType();
     Type *SrcTy = NewV->getType();
     unsigned NewAS = SrcTy->getPointerAddressSpace();

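With these two hooks taught about the flat *_num intrinsics,
InferAddressSpaces can rewrite them the same way it already rewrites flat
fadd/fmin/fmax: when the flat pointer provably originates in a specific
address space, the call is remangled onto the narrower pointer type. A rough
before/after sketch (hypothetical values, same mangling assumption as above):

    ; before
    %flat = addrspacecast ptr addrspace(1) %g to ptr
    %old  = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p0.f32(ptr %flat, float %d)

    ; after InferAddressSpaces
    %old  = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %g, float %d)
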
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f3771bff247fd8..0dd2b3f5c2c912 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1072,19 +1072,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
   (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
 >;
 
-multiclass FlatAtomicPat <string inst, string node, ValueType vt,
-                          ValueType data_vt = vt> {
-  defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size);
-  defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
-
-  def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
-    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
+                          ValueType data_vt = vt, bit isIntr = 0> {
+  defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size));
 
   let AddedComplexity = 1 in
   def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
     (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
 }
 
+multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
+                             ValueType data_vt = vt, bit isIntr = 0> {
+  defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size));
+
+  def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatAtomicPat <string inst, string node, ValueType vt,
+                          ValueType data_vt = vt, bit isIntr = 0> :
+  FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>,
+  FlatAtomicNoRtnPat<inst, node, vt, data_vt, isIntr>;
+
+multiclass FlatAtomicIntrNoRtnPat <string inst, string node, ValueType vt,
+                                 ValueType data_vt = vt> {
+  defm : FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
+multiclass FlatAtomicIntrRtnPat <string inst, string node, ValueType vt,
+                                ValueType data_vt = vt> {
+  defm : FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
+multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
+                              ValueType data_vt = vt> :
+  FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>,
+  FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+
 class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
                                ValueType vt, ValueType data_vt = vt> : GCNPat <
   (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
@@ -1305,10 +1329,10 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
 multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt,
                                          ValueType data_vt = vt> {
   let AddedComplexity = 11 in
-  def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<PatFrags>(node), vt, data_vt>;
+  def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>;
 
   let AddedComplexity = 13 in
-  def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<PatFrags>(node), vt, data_vt>;
+  def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>;
 }
 
 multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
@@ -1508,10 +1532,14 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
 let OtherPredicates = [isGFX10Plus] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
 defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
 defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+}
+
+let OtherPredicates = [isGFX10GFX11] in {
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
 }
@@ -1527,6 +1555,13 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_f
 defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
 }
 
+let OtherPredicates = [isGFX12Only] in {
+  defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
+  defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
+  defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+  defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+}
+
 let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
 defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
 defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;

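Splitting FlatAtomicPat into separate Rtn/NoRtn multiclasses (with the isIntr
bit dropping the _<size> suffix for intrinsic PatFrags) works together with
the noret_op wrappers registered in AMDGPUInstructions.td: when the
intrinsic's result is unused, the HasNoUse noret fragment matches at
AddedComplexity = 1 and the non-returning encoding is selected. A sketch of
the two cases, with hypothetical function names and the same mangling
assumption as above:

    declare float @llvm.amdgcn.flat.atomic.fmax.num.f32.p0.f32(ptr, float)

    define void @fmax_num_noret(ptr %p, float %d) {
      ; Result unused -> noret PatFrag -> FLAT_ATOMIC_FMAX (no _RTN).
      %dead = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p0.f32(ptr %p, float %d)
      ret void
    }

    define float @fmax_num_rtn(ptr %p, float %d) {
      ; Result used -> FLAT_ATOMIC_FMAX_RTN.
      %old = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p0.f32(ptr %p, float %d)
      ret float %old
    }
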
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ec82beaf37fb97..aced46bac9e51e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1230,9 +1230,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::amdgcn_global_atomic_fadd:
   case Intrinsic::amdgcn_global_atomic_fmin:
   case Intrinsic::amdgcn_global_atomic_fmax:
+  case Intrinsic::amdgcn_global_atomic_fmin_num:
+  case Intrinsic::amdgcn_global_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
   case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1315,6 +1319,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
   case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
   case Intrinsic::amdgcn_global_atomic_csub: {
@@ -8642,8 +8648,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_global_atomic_fmin:
   case Intrinsic::amdgcn_global_atomic_fmax:
+  case Intrinsic::amdgcn_global_atomic_fmin_num:
+  case Intrinsic::amdgcn_global_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fmin:
-  case Intrinsic::amdgcn_flat_atomic_fmax: {
+  case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
+  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
     MemSDNode *M = cast<MemSDNode>(Op);
     SDValue Ops[] = {
       M->getOperand(0), // Chain
@@ -8653,12 +8663,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     unsigned Opcode = 0;
     switch (IntrID) {
     case Intrinsic::amdgcn_global_atomic_fmin:
-    case Intrinsic::amdgcn_flat_atomic_fmin: {
+    case Intrinsic::amdgcn_global_atomic_fmin_num:
+    case Intrinsic::amdgcn_flat_atomic_fmin:
+    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
       Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
       break;
     }
     case Intrinsic::amdgcn_global_atomic_fmax:
-    case Intrinsic::amdgcn_flat_atomic_fmax: {
+    case Intrinsic::amdgcn_global_atomic_fmax_num:
+    case Intrinsic::amdgcn_flat_atomic_fmax:
+    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
       Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
       break;
     }

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f819172b4ffbf0..46ffc6100b292f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8689,16 +8689,13 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
        AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
     return false;
 
-  bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
-  if (ST.hasNegativeScratchOffsetBug() &&
-      FlatVariant == SIInstrFlags::FlatScratch)
-    AllowNegative = false;
   if (ST.hasNegativeUnalignedScratchOffsetBug() &&
       FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
       (Offset % 4) != 0) {
     return false;
   }
 
+  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
   unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
   return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
 }
@@ -8709,12 +8706,10 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
                              uint64_t FlatVariant) const {
   int64_t RemainderOffset = COffsetVal;
   int64_t ImmField = 0;
-  bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
-  if (ST.hasNegativeScratchOffsetBug() &&
-      FlatVariant == SIInstrFlags::FlatScratch)
-    AllowNegative = false;
 
+  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
   const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
+
   if (AllowNegative) {
     // Use signed division by a power of two to truncate towards 0.
     int64_t D = 1LL << NumBits;
@@ -8738,6 +8733,14 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
   return {ImmField, RemainderOffset};
 }
 
+bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
+  if (ST.hasNegativeScratchOffsetBug() &&
+      FlatVariant == SIInstrFlags::FlatScratch)
+    return false;
+
+  return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
+}
+
 static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
   switch (ST.getGeneration()) {
   default:

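The new allowNegativeFlatOffset helper centralizes the rule previously
duplicated in isLegalFLATOffset and splitFlatOffset: scratch accesses on
targets with the negative-scratch-offset bug never allow negative immediates,
and plain FLAT only allows them from GFX12 on. The truncate-toward-0 split
itself is unchanged; a worked example, assuming GFX12's 24-bit signed field:

    NumBits         = 24 - 1 = 23
    D               = 1 << 23 = 8388608
    COffsetVal      = -9000000
    RemainderOffset = (COffsetVal / D) * D = -1 * 8388608 = -8388608
    ImmField        = COffsetVal - RemainderOffset = -611392

ImmField fits the signed immediate field, RemainderOffset is materialized in
a register, and ImmField + RemainderOffset reproduces the original offset.
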
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 02b3422bccabfa..affe5204675209 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1294,6 +1294,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                               unsigned AddrSpace,
                                               uint64_t FlatVariant) const;
 
+  /// Returns true if negative offsets are allowed for the given \p FlatVariant.
+  bool allowNegativeFlatOffset(uint64_t FlatVariant) const;
+
   /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index 3f931412527300..83266f8d8386ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) {
   ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index ec2cd43e5fb5df..4603fbcd525c78 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-LABEL: store_load_sindex_kernel:
@@ -74,6 +75,22 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_sindex_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
@@ -144,6 +161,19 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_vindex_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    v_add_nc_u32_e32 v1, 4, v1
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -214,6 +244,20 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_vindex_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add_nc_u32_e32 v1, s32, v1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
@@ -254,6 +298,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_ptr_foo:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
   ret void
@@ -336,6 +387,23 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:260 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_sindex_small_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x104
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -417,6 +485,20 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_vindex_small_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -503,6 +585,22 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, s32 offset:256 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_vindex_small_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x100
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_add_nc_u32_e32 v1, s0, v1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -595,6 +693,23 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_sindex_large_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x4004
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -678,6 +793,20 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_vindex_large_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -766,6 +895,22 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, s0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_vindex_large_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_add_nc_u32_e32 v1, s0, v1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -839,6 +984,17 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_large_imm_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT:    scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
   %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
@@ -909,6 +1065,18 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_large_imm_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
   %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
@@ -979,6 +1147,20 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: store_load_vidx_sidx_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
 bb:
   %alloca = alloca [32 x i32], align 4, addrspace(5)
   %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1033,6 +1215,17 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_i64_aligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 8
   %load = load volatile i64, ptr addrspace(5) %arg, align 8
@@ -1082,6 +1275,17 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_i64_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 1
   %load = load volatile i64, ptr addrspace(5) %arg, align 1
@@ -1147,6 +1351,20 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_v3i32_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_mov_b32 s2, 3
+; GFX12-NEXT:    s_mov_b32 s1, 2
+; GFX12-NEXT:    s_mov_b32 s0, 1
+; GFX12-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
   %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1
@@ -1217,6 +1435,21 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: store_load_v4i32_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_mov_b32 s3, 4
+; GFX12-NEXT:    s_mov_b32 s2, 3
+; GFX12-NEXT:    s_mov_b32 s1, 2
+; GFX12-NEXT:    s_mov_b32 s0, 1
+; GFX12-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
   %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 5516741e57e929..bfb2ecde783a63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 
 define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
   ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index 5787392a5901ac..d2c42292a03642 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
index f1c3673ae29dd3..bfe806f47c86ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
@@ -4,6 +4,8 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+
 
 ---
 name:            amdgpu_atomic_cmpxchg_s32_flat
@@ -53,6 +55,16 @@ body:             |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
+    ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -126,6 +138,16 @@ body:             |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
+    ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -185,6 +207,16 @@ body:             |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
+    ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
+    ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = COPY $vgpr4_vgpr5
@@ -258,6 +290,16 @@ body:             |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
+    ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
+    ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = COPY $vgpr4_vgpr5
@@ -349,6 +391,16 @@ body:             |
     ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
+    ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX12-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -404,6 +456,15 @@ body:             |
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX11-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
+    ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX12-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -456,6 +517,15 @@ body:             |
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX11-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
+    ; GFX12-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
+    ; GFX12-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = COPY $vgpr4_vgpr5

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
index e1ef96bec0fdba..4dba133bf110d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
@@ -6,6 +6,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
 
 ---
 name:            amdgpu_atomic_cmpxchg_s32_global

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
index 868f08d805caa4..5dbc5288e38244 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
@@ -1,9 +1,10 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX7,GCN %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX7,GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX9,GCN,LARGE_IOFFSET %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX10,GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GCN,LARGE_IOFFSET %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX12,GCN,LARGE_IOFFSET %s
 
 ---
 name:            flat_atomicrmw_add_s32
@@ -14,37 +15,13 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX7-LABEL: name: flat_atomicrmw_add_s32
-    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX7-NEXT: {{  $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
-    ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
-    ;
-    ; GFX10-LABEL: name: flat_atomicrmw_add_s32
-    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ; GCN-LABEL: name: flat_atomicrmw_add_s32
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GCN-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GCN-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 0)
@@ -61,33 +38,12 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX7-LABEL: name: flat_atomicrmw_add_s32_nortn
-    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX7-NEXT: {{  $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32_nortn
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ;
-    ; GFX10-LABEL: name: flat_atomicrmw_add_s32_nortn
-    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32_nortn
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GCN-LABEL: name: flat_atomicrmw_add_s32_nortn
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GCN-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 0)
@@ -119,13 +75,13 @@ body:             |
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2047
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; LARGE_IOFFSET-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -142,14 +98,6 @@ body:             |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -183,12 +131,12 @@ body:             |
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -204,13 +152,6 @@ body:             |
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -244,13 +185,13 @@ body:             |
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2048
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; LARGE_IOFFSET-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -267,14 +208,6 @@ body:             |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2048
@@ -308,12 +241,12 @@ body:             |
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -329,13 +262,6 @@ body:             |
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2048
@@ -369,13 +295,13 @@ body:             |
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset4095
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; LARGE_IOFFSET-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -392,14 +318,6 @@ body:             |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -433,12 +351,12 @@ body:             |
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -454,13 +372,6 @@ body:             |
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -541,6 +452,14 @@ body:             |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
+    ; GFX12-LABEL: name: flat_atomicrmw_add_s32_offset4097
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4097, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -618,6 +537,13 @@ body:             |
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
+    ; GFX12-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4097, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -635,37 +561,13 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
 
-    ; GFX7-LABEL: name: flat_atomicrmw_add_s64
-    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX7-NEXT: {{  $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
-    ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s64
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
-    ;
-    ; GFX10-LABEL: name: flat_atomicrmw_add_s64
-    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s64
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ; GCN-LABEL: name: flat_atomicrmw_add_s64
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GCN-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 0)
@@ -682,33 +584,12 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
 
-    ; GFX7-LABEL: name: flat_atomicrmw_add_s64_nortn
-    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX7-NEXT: {{  $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s64_nortn
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ;
-    ; GFX10-LABEL: name: flat_atomicrmw_add_s64_nortn
-    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s64_nortn
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX11-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GCN-LABEL: name: flat_atomicrmw_add_s64_nortn
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GCN-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 0)
@@ -740,13 +621,13 @@ body:             |
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s64_offset4095
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; LARGE_IOFFSET-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; LARGE_IOFFSET-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -763,14 +644,6 @@ body:             |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -804,12 +677,12 @@ body:             |
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ;
-    ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
-    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; LARGE_IOFFSET-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
+    ; LARGE_IOFFSET: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; LARGE_IOFFSET-NEXT: {{  $}}
+    ; LARGE_IOFFSET-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; LARGE_IOFFSET-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; LARGE_IOFFSET-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -825,13 +698,6 @@ body:             |
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
-    ;
-    ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
-    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX11-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_CONSTANT i64 4095

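The LARGE_IOFFSET group above (gfx900, gfx1100, gfx1200) covers the targets whose flat atomics can absorb immediate offsets up to 4095, while the GFX7 and GFX10 checks materialize the address with V_ADD_CO_U32/V_ADDC_U32 and use a zero offset. The offset4097 cases are where gfx1200 diverges from the rest of the group: its wider offset field folds 4097 as well, so it gets a dedicated GFX12 block while GFX11 keeps the explicit add. A hypothetical IR-level equivalent of the offset4097 test, offered as a sketch rather than the source these checks were generated from:

  define i32 @flat_atomicrmw_add_offset4097(ptr %p, i32 %v) {
    %gep = getelementptr i8, ptr %p, i64 4097   ; byte offset past the 12-bit range
    %old = atomicrmw add ptr %gep, i32 %v seq_cst
    ret i32 %old
  }

On gfx1200 the 4097 lands in the FLAT_ATOMIC_ADD_RTN immediate; gfx1100 emits the 64-bit add visible in the GFX11 checks above.
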
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
index 43d9b911b9f335..637a7d080dd79d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
@@ -5,6 +5,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 name:            global_atomicrmw_add_s32
@@ -59,6 +60,14 @@ body:             |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 1)
@@ -114,6 +123,13 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s32), addrspace 1)
@@ -181,6 +197,14 @@ body:             |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2047
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -246,6 +270,13 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -323,6 +354,14 @@ body:             |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2048
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2048
@@ -396,6 +435,13 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2048
@@ -473,6 +519,14 @@ body:             |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4095
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -546,6 +600,13 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -640,6 +701,14 @@ body:             |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4097
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4097, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -730,6 +799,13 @@ body:             |
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4097, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -791,6 +867,14 @@ body:             |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 1)
@@ -846,6 +930,13 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s64_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst (s64), addrspace 1)
@@ -921,6 +1012,14 @@ body:             |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s64_offset4095
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -994,6 +1093,13 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_CONSTANT i64 4095

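The global variant mirrors the flat file: gfx1200 gets its own GFX12 prefix, and its blocks match GFX11 everywhere except the offset4097 pair, where GLOBAL_ATOMIC_ADD(_RTN) again takes 4097 as an immediate instead of the add-then-zero-offset sequence. A hypothetical IR sketch of that case (assumed function name; addrspace(1) is the global address space):

  define i32 @global_atomicrmw_add_offset4097(ptr addrspace(1) %p, i32 %v) {
    %gep = getelementptr i8, ptr addrspace(1) %p, i64 4097
    %old = atomicrmw add ptr addrspace(1) %gep, i32 %v seq_cst
    ret i32 %old
  }
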
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
index de21788ff168f7..a6aa1be8addc71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
@@ -3,6 +3,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 
@@ -42,6 +43,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_s32_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst (s32), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -86,6 +94,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>))
     ; GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_v2s16_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>))
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load seq_cst (<2 x s16>), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -130,6 +145,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3))
     ; GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_p3_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3))
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = G_LOAD %0 :: (load seq_cst (p3), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -174,6 +196,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_s64_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst (s64), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -218,6 +247,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_v2s32_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst (<2 x s32>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -262,6 +298,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_v4s16_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load seq_cst (<4 x s16>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -306,6 +349,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_p1_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = G_LOAD %0 :: (load seq_cst (p1), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -350,6 +400,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_p0_seq_cst
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(p0) = G_LOAD %0 :: (load seq_cst (p0), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -426,6 +483,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
     %2:vgpr(p0) = G_PTR_ADD %0, %1
@@ -488,6 +552,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095
     %2:vgpr(p0) = G_PTR_ADD %0, %1

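In the atomic-load file the interesting gfx1200 delta is load_atomic_flat_s32_seq_cst_gep_m2048: GFX11 builds the address with a 64-bit add, while GFX12 folds -2048 straight into FLAT_LOAD_DWORD, which indicates that flat immediate offsets are signed on gfx1200. A hypothetical IR sketch of that test (assumed function name, not taken from the patch):

  define i32 @load_atomic_flat_gep_m2048(ptr %p) {
    %gep = getelementptr i8, ptr %p, i64 -2048  ; negative byte offset
    %v = load atomic i32, ptr %gep seq_cst, align 4
    ret i32 %v
  }
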
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
index b678966de85377..1705b74f6395a8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
@@ -5,6 +5,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 
 ---
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir
index 11c0d8beab3031..553e8c773b466b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir
@@ -4,6 +4,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 
 ---
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index d7c32543988626..eb62d805334558 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -4,6 +4,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 
 ---
@@ -51,6 +52,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_4
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -102,6 +110,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_2
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -153,6 +168,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -204,6 +226,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_v2s32
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -255,6 +284,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_v3s32
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<3 x  s32>) = G_LOAD %0 :: (load (<3 x s32>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -306,6 +342,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_v4s32
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x  s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -357,6 +400,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s64
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -408,6 +458,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_v2s64
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -459,6 +516,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
+    ; GFX12-LABEL: name: load_flat_v2p1
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -510,6 +574,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ;
+    ; GFX12-LABEL: name: load_flat_s96
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s96) = G_LOAD %0 :: (load (s96), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -561,6 +632,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
+    ; GFX12-LABEL: name: load_flat_s128
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -612,6 +690,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_flat_p3_from_4
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -663,6 +748,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_p1_from_8
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = G_LOAD %0 :: (load (p1), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -714,6 +806,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
+    ; GFX12-LABEL: name: load_flat_p999_from_8
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -765,6 +864,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
+    ; GFX12-LABEL: name: load_flat_v2p3
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -816,6 +922,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_flat_v2s16
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -867,6 +980,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>))
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_v4s16
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>))
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -918,6 +1038,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
+    ;
+    ; GFX12-LABEL: name: load_flat_v6s16
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<6 x  s16>) = G_LOAD %0 :: (load (<6 x s16>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -969,6 +1096,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
+    ; GFX12-LABEL: name: load_flat_v8s16
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<8 x  s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -1048,6 +1182,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2047
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2047
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1125,6 +1266,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2048
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2048
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1218,6 +1366,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m2047
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2047
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1311,6 +1466,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m2048
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1388,6 +1550,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_4095
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1481,6 +1650,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_4096
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4096, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4096
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1574,6 +1750,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m4095
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4095
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1667,6 +1850,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m4096
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4096
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1760,6 +1950,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_8191
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8191
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1853,6 +2050,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_8192
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8192
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1946,6 +2150,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m8191
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8191
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2039,6 +2250,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_m8192
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8192
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2046,3 +2264,419 @@ body: |
     $vgpr0 = COPY %3
 
 ...
+
+---
+
+name: load_flat_s32_from_1_gep_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX7-LABEL: name: load_flat_s32_from_1_gep_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_flat_s32_from_1_gep_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_flat_s32_from_1_gep_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_flat_s32_from_1_gep_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_flat_s32_from_1_gep_24bit_max
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 8388607
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_flat_s32_from_1_gep_2x_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX7-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 16777214
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_flat_s32_from_1_gep_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX7-LABEL: name: load_flat_s32_from_1_gep_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_flat_s32_from_1_gep_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_flat_s32_from_1_gep_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_flat_s32_from_1_gep_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_flat_s32_from_1_gep_24bit_min
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 -8388608
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_flat_s32_from_1_gep_2x_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX7-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_flat_s32_from_1_gep_2x_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 -16777215
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
+    $vgpr0 = COPY %3
+
+...

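A note on what the new *_gep_24bit_* cases above establish: GFX12 flat loads
accept a signed 24-bit immediate offset, so 8388607 and -8388608 fold straight
into the FLAT_LOAD_UBYTE operand, while 16777214 and -16777215 still go
through the V_ADD_CO_U32/V_ADDC_U32 address computation used on older
targets. A minimal IR-level sketch of the folding case (function name is
illustrative, not from the patch), assuming it is compiled with
llc -march=amdgcn -mcpu=gfx1200:

define i8 @flat_load_offset_24bit_max(ptr %p) {
  ; 8388607 is the signed 24-bit maximum; per the MIR checks above it is
  ; expected to become the load's immediate offset rather than a separate
  ; 64-bit add.
  %gep = getelementptr i8, ptr %p, i64 8388607
  %v = load i8, ptr %gep, align 1
  ret i8 %v
}
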
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
index 0103bfc9d39c14..0fb9864b460072 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
@@ -2,6 +2,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 name: load_global_s32_from_sgpr
@@ -36,6 +37,14 @@ body: |
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(p1) = COPY %0
     %2:vgpr(s32) = G_LOAD %1 :: (load (s32), align 4, addrspace 1)
@@ -78,6 +87,14 @@ body: |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_zext_vgpr
+    ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(p1) = COPY %0
@@ -123,6 +140,14 @@ body: |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_merge_zext_vgpr
+    ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(p1) = COPY %0
@@ -198,6 +223,24 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_merge_not_0_vgpr
+    ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]]
+    ; GFX12-NEXT: %notzero:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %notzero, %subreg.sub1
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1
+    ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(p1) = COPY %0
@@ -261,6 +304,14 @@ body: |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095
+    ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(p1) = COPY %0
@@ -326,6 +377,14 @@ body: |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096
+    ; GFX12: liveins: $sgpr0_sgpr1, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(p1) = COPY %0
@@ -371,6 +430,14 @@ body: |
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4096
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4096, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4096
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -413,6 +480,14 @@ body: |
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4097
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4097, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4097
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -479,6 +554,14 @@ body: |
     ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -4097, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -4097
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -521,6 +604,14 @@ body: |
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 2049
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -571,6 +662,14 @@ body: |
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -2049
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -612,6 +711,14 @@ body: |
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4286578688, implicit $exec
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 8388607, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4294967295
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -683,6 +790,24 @@ body: |
     ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4294967296
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -755,6 +880,24 @@ body: |
     ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
+    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4294971390
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -827,6 +970,24 @@ body: |
     ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -4294967295
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -898,6 +1059,24 @@ body: |
     ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -4294967296
     %2:sgpr(p1) = G_PTR_ADD %0, %1
@@ -932,6 +1111,12 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_copy_undef_sgpr
+    ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = G_IMPLICIT_DEF
     %1:vgpr(p1) = COPY %0
     %2:vgpr(s32) = G_LOAD %1 :: (load (s32), align 4, addrspace 1)
@@ -961,6 +1146,11 @@ body: |
     ; GFX11: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_undef_vgpr
+    ; GFX12: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = G_IMPLICIT_DEF
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1)
     $vgpr0 = COPY %1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
index 27806edc918088..f8ef9426bc36e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
@@ -6,6 +6,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 
@@ -78,6 +79,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_4
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1)
     $vgpr0 = COPY %1
@@ -153,6 +161,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_2
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 1)
     $vgpr0 = COPY %1
@@ -228,6 +243,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 1)
     $vgpr0 = COPY %1
@@ -303,6 +325,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_global_v2s32
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 1)
     $vgpr0_vgpr1 = COPY %1
@@ -378,6 +407,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
+    ; GFX12-LABEL: name: load_global_v4s32
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 1)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -453,6 +489,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_global_s64
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1)
     $vgpr0_vgpr1 = COPY %1
@@ -528,6 +571,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
+    ; GFX12-LABEL: name: load_global_v2s64
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 1)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -593,6 +643,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
+    ; GFX12-LABEL: name: load_global_v2p1
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 1)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -658,6 +715,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
+    ; GFX12-LABEL: name: load_global_s128
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 1)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -733,6 +797,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_p3_from_4
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 1)
     $vgpr0 = COPY %1
@@ -808,6 +879,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_global_p1_from_8
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = G_LOAD %0 :: (load (p1), align 8, addrspace 1)
     $vgpr0_vgpr1 = COPY %1
@@ -873,6 +951,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
+    ; GFX12-LABEL: name: load_global_p999_from_8
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 1)
     $vgpr0_vgpr1 = COPY %1
@@ -938,6 +1023,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
+    ; GFX12-LABEL: name: load_global_v2p3
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 1)
     $vgpr0_vgpr1 = COPY %1
@@ -1013,6 +1105,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_global_v2s16
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 1)
     $vgpr0 = COPY %1
@@ -1088,6 +1187,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
+    ; GFX12-LABEL: name: load_global_v4s16
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 1)
     $vgpr0_vgpr1 = COPY %1
@@ -1163,6 +1269,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
+    ; GFX12-LABEL: name: load_global_v8s16
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 1)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -1258,6 +1371,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_2047
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2047
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1359,6 +1479,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_2048
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2048
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1468,6 +1595,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2047
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2047
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1577,6 +1711,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2048
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1678,6 +1819,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_4095
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1797,6 +1945,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_4096
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4096, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4096
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -1914,6 +2069,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4095
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4095
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2031,6 +2193,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4096
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4096
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2150,6 +2319,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_8191
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8191
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2269,6 +2445,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_8192
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8192
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2402,6 +2585,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8191
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8191
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2535,6 +2725,13 @@ body: |
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8192
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8192
     %2:vgpr(p1) = G_PTR_ADD %0, %1
@@ -2542,3 +2739,551 @@ body: |
     $vgpr0 = COPY %3
 
 ...
+
+---
+
+name: load_global_s32_from_1_gep_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 8388607
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_global_s32_from_1_gep_2x_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 16777214
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_global_s32_from_1_gep_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 -8388608
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_global_s32_from_1_gep_2x_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
+    ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX11: liveins: $vgpr0_vgpr1
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 -16777215
+    %2:vgpr(p1) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1)
+    $vgpr0 = COPY %3
+
+...
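
The ..._24bit_max/min and ..._2x_24bit_max/min tests above pin down the boundary of the new GFX12 immediate-offset folding: the VGLOBAL/VFLAT encodings carry a 24-bit signed offset, so -8388608 folds straight into GLOBAL_LOAD_UBYTE while 16777214 and -16777215 still go through the V_ADD_CO_U32/V_ADDC_U32 pair. A minimal standalone sketch of that range check (hypothetical helper names, not the in-tree SIInstrInfo logic):

#include <cstdint>

// Does Offset fit in NumBits as a signed immediate? (Same idea as llvm::isIntN.)
static bool fitsSignedBits(int64_t Offset, unsigned NumBits) {
  int64_t Min = -(int64_t(1) << (NumBits - 1));     // -8388608 for 24 bits
  int64_t Max = (int64_t(1) << (NumBits - 1)) - 1;  //  8388607 for 24 bits
  return Offset >= Min && Offset <= Max;
}

// GFX12 global/flat addressing: fold the offset iff it fits in 24 signed bits.
static bool canFoldGlobalOffsetGFX12(int64_t Offset) {
  return fitsSignedBits(Offset, 24);
}

// canFoldGlobalOffsetGFX12(-8388608)  -> true  (folded; see the ..._24bit_min GFX12 checks)
// canFoldGlobalOffsetGFX12(-16777215) -> false (kept as a 64-bit add; ..._2x_24bit_min)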

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
index 286b4daa79b93f..ef1bb842cd354c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
@@ -5,6 +5,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 
 ---
 

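An aside on the scratch tests that follow: GFX11 SCRATCH_LOAD_* carries a 13-bit signed offset, and with a VGPR address a positive offset only folds when the base is provably non-negative (the ..._gep_2047_known_bits case), which is why GFX11 folds -4096 but emits V_ADD_U32 for 2047. GFX12 widens the immediate to 24 signed bits and folds both signs directly, down to -8388608 and up to 8388607. A small sketch of that predicate, with hypothetical names rather than the in-tree AMDGPUInstructionSelector logic:

#include <cstdint>

static bool canFoldScratchOffset(int64_t Offset, bool IsGFX12,
                                 bool BaseKnownNonNegative) {
  if (IsGFX12) // 24-bit signed immediate: [-8388608, 8388607]
    return Offset >= -8388608 && Offset <= 8388607;
  // GFX11: 13-bit signed immediate: [-4096, 4095]; a positive offset
  // additionally needs the VGPR base to be known non-negative.
  if (Offset < -4096 || Offset > 4095)
    return false;
  return Offset <= 0 || BaseKnownNonNegative;
}

// Matches the checks below: (2047, GFX11, unknown base) -> false (V_ADD_U32),
// (2047, GFX11, known non-negative) -> true, (2047, GFX12, unknown) -> true,
// (-8388608, GFX12, unknown) -> true, while -16777215 stays an add everywhere.
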
diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index ea320cec9991d5..c8c21978062401 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -4,6 +4,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 
@@ -39,6 +40,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_4
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5)
     $vgpr0 = COPY %1
@@ -79,6 +87,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_USHORT:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_USHORT]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_2
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_USHORT:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_USHORT]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 5)
     $vgpr0 = COPY %1
@@ -119,6 +134,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 5)
     $vgpr0 = COPY %1
@@ -159,6 +181,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_private_p3_from_4
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 5)
     $vgpr0 = COPY %1
@@ -199,6 +228,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p5), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_private_p5_from_4
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p5), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = G_LOAD %0 :: (load (p5), align 4, addrspace 5)
     $vgpr0 = COPY %1
@@ -240,6 +276,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_private_v2s16
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 5)
     $vgpr0 = COPY %1
@@ -288,6 +331,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_2047
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2047
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -327,6 +377,15 @@ body: |
     ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_AND_B32_e64_]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_2047_known_bits
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
+    ; GFX12-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_AND_B32_e64_]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2147483647
     %2:vgpr(s32) = G_AND %0, %1
@@ -376,6 +435,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_2048
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2048
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -422,6 +488,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_m2047
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2047
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -468,6 +541,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_m2048
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2048
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -514,6 +594,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_4095
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -562,6 +649,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_4096
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4096
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -608,6 +702,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_m4095
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4095
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -654,6 +755,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_m4096
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4096
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -702,6 +810,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_8191
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8191
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -750,6 +865,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_8192
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8192
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -798,6 +920,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_m8191
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8191
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -846,6 +975,13 @@ body: |
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_m8192
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8192
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -856,6 +992,230 @@ body: |
 
 ---
 
+name: load_private_s32_from_1_gep_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins:  $vgpr0
+
+    ; GFX6-LABEL: name: load_private_s32_from_1_gep_24bit_max
+    ; GFX6: liveins: $vgpr0
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8388607, implicit $exec
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX9-LABEL: name: load_private_s32_from_1_gep_24bit_max
+    ; GFX9: liveins: $vgpr0
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8388607, implicit $exec
+    ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX11-LABEL: name: load_private_s32_from_1_gep_24bit_max
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8388607, implicit $exec
+    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_24bit_max
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:vgpr(s32) = G_CONSTANT i32 8388607
+    %2:vgpr(p5) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_private_s32_from_1_gep_2x_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins:  $vgpr0
+
+    ; GFX6-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max
+    ; GFX6: liveins: $vgpr0
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX9-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max
+    ; GFX9: liveins: $vgpr0
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec
+    ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX11-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec
+    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_2x_24bit_max
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16777214, implicit $exec
+    ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:vgpr(s32) = G_CONSTANT i32 16777214
+    %2:vgpr(p5) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_private_s32_from_1_gep_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins:  $vgpr0
+
+    ; GFX6-LABEL: name: load_private_s32_from_1_gep_24bit_min
+    ; GFX6: liveins: $vgpr0
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8388608, implicit $exec
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX9-LABEL: name: load_private_s32_from_1_gep_24bit_min
+    ; GFX9: liveins: $vgpr0
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8388608, implicit $exec
+    ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX11-LABEL: name: load_private_s32_from_1_gep_24bit_min
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8388608, implicit $exec
+    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_24bit_min
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:vgpr(s32) = G_CONSTANT i32 -8388608
+    %2:vgpr(p5) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
+name: load_private_s32_from_1_gep_2x_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins:  $vgpr0
+
+    ; GFX6-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min
+    ; GFX6: liveins: $vgpr0
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX9-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min
+    ; GFX9: liveins: $vgpr0
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec
+    ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
+    ; GFX11-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec
+    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_gep_2x_24bit_min
+    ; GFX12: liveins: $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16777215, implicit $exec
+    ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:vgpr(s32) = G_CONSTANT i32 -16777215
+    %2:vgpr(p5) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 5)
+    $vgpr0 = COPY %3
+
+...
+
+---
+
 name: load_private_s32_from_4_constant_0
 legalized:       true
 regBankSelected: true
@@ -879,6 +1239,11 @@ body: |
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_4_constant_0
+    ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
     %0:vgpr(p5) = G_CONSTANT i32 0
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5)
     $vgpr0 = COPY %1
@@ -910,6 +1275,11 @@ body: |
     ; GFX11: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 16
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR [[S_MOV_B32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_4_constant_sgpr_16
+    ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 16
+    ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR [[S_MOV_B32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]]
     %0:sgpr(p5) = G_CONSTANT i32 16
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5)
     $vgpr0 = COPY %1
@@ -941,6 +1311,11 @@ body: |
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_constant_4095
+    ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 5)
     $vgpr0 = COPY %1
@@ -974,6 +1349,11 @@ body: |
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_constant_4096
+    ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX12-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 5)
     $vgpr0 = COPY %1
@@ -1006,6 +1386,10 @@ body: |
     ; GFX11-LABEL: name: load_private_s32_from_fi
     ; GFX11: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_fi
+    ; GFX12: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5)
     $vgpr0 = COPY %1
@@ -1037,6 +1421,10 @@ body: |
     ; GFX11-LABEL: name: load_private_s32_from_1_fi_offset_4095
     ; GFX11: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_fi_offset_4095
+    ; GFX12: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -1071,6 +1459,10 @@ body: |
     ; GFX11-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095
     ; GFX11: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095
+    ; GFX12: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:sgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(s32) = COPY %1
@@ -1113,6 +1505,10 @@ body: |
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE_SVS:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SVS [[V_MOV_B32_e32_]], %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SVS]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_1_fi_offset_4096
+    ; GFX12: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4096
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -1149,6 +1545,11 @@ body: |
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
+    ;
+    ; GFX12-LABEL: name: load_private_s32_from_neg1
+    ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX12-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD]]
     %0:vgpr(p5) = G_CONSTANT i32 -1
     %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 5)
     $vgpr0 = COPY %1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
index 289d70be83c01b..1d708ef0c7951f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
@@ -3,6 +3,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 
 ---
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir
index 827eb13e3e4e84..247be2c85b39f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir
@@ -4,6 +4,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 
 ---
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index fc6925ee5709c5..9c13135c0bf172 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -4,6 +4,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 
@@ -50,6 +51,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_to_4
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store (s32), align 4, addrspace 0)
@@ -100,6 +108,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_to_2
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store (s16), align 2, addrspace 0)
@@ -150,6 +165,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_to_1
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 0)
@@ -201,6 +223,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
+    ;
+    ; GFX12-LABEL: name: store_flat_s64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (s64), align 8, addrspace 0)
@@ -251,6 +280,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX11-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16)
+    ;
+    ; GFX12-LABEL: name: store_flat_s96
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX12-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
     G_STORE %1, %0 :: (store (s96), align 16, addrspace 0)
@@ -301,6 +337,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128))
+    ;
+    ; GFX12-LABEL: name: store_flat_s128
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (s128), align 16, addrspace 0)
@@ -352,6 +395,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v2s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (<2 x s32>), align 8, addrspace 0)
@@ -402,6 +452,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
     ; GFX11-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16)
+    ;
+    ; GFX12-LABEL: name: store_flat_v3s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX12-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4
     G_STORE %1, %0 :: (store (<3 x s32>), align 16, addrspace 0)
@@ -452,6 +509,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v4s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<4 x s32>), align 16, addrspace 0)
@@ -503,6 +567,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v2s16
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = COPY $vgpr2
     G_STORE %1, %0 :: (store (<2 x s16>), align 4, addrspace 0)
@@ -554,6 +625,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v4s16
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (<4 x s16>), align 8, addrspace 0)
@@ -605,6 +683,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX11-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16)
+    ;
+    ; GFX12-LABEL: name: store_flat_v6s16
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX12-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
     G_STORE %1, %0 :: (store (<6 x s16>), align 16, addrspace 0)
@@ -655,6 +740,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v8s16
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 0)
@@ -706,6 +798,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v2s64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<2 x s64>), align 16, addrspace 0)
@@ -757,6 +856,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1))
+    ;
+    ; GFX12-LABEL: name: store_flat_p1
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (p1), align 8, addrspace 0)
@@ -808,6 +914,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v2p1
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<2 x p1>), align 16, addrspace 0)
@@ -859,6 +972,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3))
+    ;
+    ; GFX12-LABEL: name: store_flat_p3
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %1, %0 :: (store (p3), align 4, addrspace 0)
@@ -910,6 +1030,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>))
+    ;
+    ; GFX12-LABEL: name: store_flat_v2p3
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (<2 x p3>), align 8, addrspace 0)
@@ -960,6 +1087,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32))
+    ;
+    ; GFX12-LABEL: name: store_atomic_flat_s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store monotonic (s32), align 4, addrspace 0)
@@ -1011,6 +1145,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX11-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64))
+    ;
+    ; GFX12-LABEL: name: store_atomic_flat_s64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store monotonic (s64), align 8, addrspace 0)
@@ -1086,6 +1227,13 @@ body: |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_gep_2047
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (store (s32))
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -1093,3 +1241,418 @@ body: |
     G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
 
 ...
+
+---
+
+name: store_flat_s32_to_1_gep_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX7-LABEL: name: store_flat_s32_to_1_gep_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX8-LABEL: name: store_flat_s32_to_1_gep_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX9-LABEL: name: store_flat_s32_to_1_gep_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX10-LABEL: name: store_flat_s32_to_1_gep_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX11-LABEL: name: store_flat_s32_to_1_gep_24bit_max
+    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_to_1_gep_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 8388607, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 8388607
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
+
+...
+
+---
+
+name: store_flat_s32_to_1_gep_2x_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX7-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX8-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX9-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX10-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX11-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max
+    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 16777214
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
+
+...
+
+---
+
+name: store_flat_s32_to_1_gep_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX7-LABEL: name: store_flat_s32_to_1_gep_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX8-LABEL: name: store_flat_s32_to_1_gep_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX9-LABEL: name: store_flat_s32_to_1_gep_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX10-LABEL: name: store_flat_s32_to_1_gep_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX11-LABEL: name: store_flat_s32_to_1_gep_24bit_min
+    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_to_1_gep_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], -8388608, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 -8388608
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
+
+...
+
+---
+
+name: store_flat_s32_to_1_gep_2x_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX7-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX8-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX9-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX10-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX11-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min
+    ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
+    ; GFX12-LABEL: name: store_flat_s32_to_1_gep_2x_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 -16777215
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
+...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
index 95e01aa413a374..79cd2a07883a67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
@@ -6,6 +6,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 
@@ -69,6 +70,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_to_4
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store (s32), align 4, addrspace 1)
@@ -136,6 +144,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s16), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_to_2
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s16), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store (s16), align 2, addrspace 1)
@@ -203,6 +218,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_to_1
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s8), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 1)
@@ -271,6 +293,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (s64), align 8, addrspace 1)
@@ -328,6 +357,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s128
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (s128), align 16, addrspace 1)
@@ -396,6 +432,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v2s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (<2 x s32>), align 8, addrspace 1)
@@ -463,6 +506,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v4s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<4 x s32>), align 16, addrspace 1)
@@ -531,6 +581,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v2s16
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = COPY $vgpr2
     G_STORE %1, %0 :: (store (<2 x s16>), align 4, addrspace 1)
@@ -599,6 +656,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v4s16
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (<4 x s16>), align 8, addrspace 1)
@@ -667,6 +731,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v8s16
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 1)
@@ -735,6 +806,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v2s64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<2 x s64>), align 16, addrspace 1)
@@ -803,6 +881,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p1), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_p1
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p1), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (p1), align 8, addrspace 1)
@@ -861,6 +946,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v2p1
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store (<2 x p1>), align 16, addrspace 1)
@@ -929,6 +1021,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p3), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_p3
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p3), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %1, %0 :: (store (p3), align 4, addrspace 1)
@@ -987,6 +1086,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_v2p3
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store (<2 x p3>), align 8, addrspace 1)
@@ -1054,6 +1160,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_atomic_global_s32
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store monotonic (s32), align 4, addrspace 1)
@@ -1122,6 +1235,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_atomic_global_s64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store monotonic (s64), align 8, addrspace 1)
@@ -1206,6 +1326,13 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_gep_2047
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (store (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -1213,3 +1340,490 @@ body: |
     G_STORE %1, %3 :: (store (s32), align 4, addrspace 1)
 
 ...
+
+---
+
+name: store_global_s32_to_1_gep_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX6-LABEL: name: store_global_s32_to_1_gep_24bit_max
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-LABEL: name: store_global_s32_to_1_gep_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_24bit_max
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX8-LABEL: name: store_global_s32_to_1_gep_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX9-LABEL: name: store_global_s32_to_1_gep_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX10-LABEL: name: store_global_s32_to_1_gep_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_to_1_gep_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 8388607, 0, implicit $exec :: (store (s32), addrspace 1)
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 8388607
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 1)
+...
+
+---
+
+name: store_global_s32_to_1_gep_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX6-LABEL: name: store_global_s32_to_1_gep_24bit_min
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-LABEL: name: store_global_s32_to_1_gep_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_24bit_min
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX8-LABEL: name: store_global_s32_to_1_gep_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX9-LABEL: name: store_global_s32_to_1_gep_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX10-LABEL: name: store_global_s32_to_1_gep_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_to_1_gep_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], -8388608, 0, implicit $exec :: (store (s32), addrspace 1)
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 -8388608
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 1)
+
+...
+
+---
+
+name: store_global_s32_to_1_gep_2x_24bit_max
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX6-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX8-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX9-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX10-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_to_1_gep_2x_24bit_max
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 16777214
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 1)
+
+...
+
+---
+
+name: store_global_s32_to_1_gep_2x_24bit_min
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+
+    ; GFX6-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-NEXT: {{  $}}
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX7-FLAT-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT-NEXT: {{  $}}
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX8-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min
+    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
+    ; GFX9-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX10-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min
+    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
+    ; GFX12-LABEL: name: store_global_s32_to_1_gep_2x_24bit_min
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
+    ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    %0:vgpr(p1) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(s64) = G_CONSTANT i64 -16777215
+    %3:vgpr(p1) = G_PTR_ADD %0, %2
+    G_STORE %1, %3 :: (store (s32), align 4, addrspace 1)
+
+...

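The new global-store cases mirror the flat ones: GFX12 VGLOBAL stores also take a signed 24-bit immediate offset, so 8388607 (2^23 - 1) and -8388608 (-2^23) fold directly into GLOBAL_STORE_DWORD, while 16777214 and -16777215 still require materializing the address with a carry chain, as on GFX9 and GFX10 (where the immediate field is narrower). A minimal IR sketch of the in-range case (the function name is illustrative, not from the patch):

  ; On gfx1200 the offset should fold into the store's immediate field;
  ; older targets need an explicit 64-bit add to form the address.
  define void @global_store_max_offset(ptr addrspace(1) %p, i32 %v) {
    %gep = getelementptr i8, ptr addrspace(1) %p, i64 8388607
    store i32 %v, ptr addrspace(1) %gep, align 4
    ret void
  }
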
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
index 54774ff512cb62..557fedb5d73718 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
@@ -5,6 +5,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 
 # 96-bit load/store is only legal for SI, so split the test out.
 

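The private-store diff below covers the VSCRATCH side: gfx1200 selects the same SCRATCH_STORE_BYTE/SHORT/DWORD forms as gfx1100 (with $flat_scr as an implicit operand) instead of the BUFFER_STORE_*_OFFEN path used by the older targets. Equivalent IR, as a sketch (the function name is illustrative):

  ; Expected to select scratch_store_b32 on gfx1100 and gfx1200.
  define void @private_store(i32 %v, ptr addrspace(5) %p) {
    store i32 %v, ptr addrspace(5) %p, align 4
    ret void
  }
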
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index 4f801759c38e1a..ed14d0598c8d9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -4,6 +4,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 ---
 
@@ -26,18 +27,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_4
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_4
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_4
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (s32), align 4, addrspace 5)
@@ -65,18 +75,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_2
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_2
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_2
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (s16), align 2, addrspace 5)
@@ -104,18 +123,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_1
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_1
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_1
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (s8), align 1, addrspace 5)
@@ -143,18 +171,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_v2s16
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_v2s16
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (<2 x s16>), align 4, addrspace 5)
@@ -182,18 +219,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_p3
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_p3
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_p3
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (p3), align 4, addrspace 5)
@@ -221,18 +267,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_p5
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_p5
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_p5
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5)
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (p5), align 4, addrspace 5)
@@ -258,12 +313,18 @@ body: |
     ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
+    ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -291,13 +352,20 @@ body: |
     ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4095
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_1_constant_4095
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_1_constant_4095
+    ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
@@ -324,14 +392,21 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_1_constant_4096
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_1_constant_4096
+    ; GFX12: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
@@ -358,18 +433,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_s32_to_4
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_s32_to_4
     ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_s32_to_4
+    ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (s32), align 4, addrspace 5)
@@ -396,18 +480,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_s32_to_2
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_s32_to_2
     ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_s32_to_2
+    ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (s16), align 2, addrspace 5)
@@ -434,18 +527,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_s32_to_1
     ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_s32_to_1
+    ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (s8), align 1, addrspace 5)
@@ -473,18 +575,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_v2s16
     ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_v2s16
+    ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 5)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (<2 x s16>), align 4, addrspace 5)
@@ -511,18 +622,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_p3
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_p3
     ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_p3
+    ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 5)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (p3), align 4, addrspace 5)
@@ -549,18 +669,27 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_p5
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (p5), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_p5
     ; GFX11: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_p5
+    ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p5), addrspace 5)
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store (p5), align 4, addrspace 5)
@@ -588,16 +717,24 @@ body: |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE_SADDR [[V_MOV_B32_e32_]], %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -627,17 +764,26 @@ body: |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4095
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_s32_to_1_constant_4095
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_s32_to_1_constant_4095
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
@@ -666,18 +812,27 @@ body: |
     ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4096
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9-NEXT: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 5)
+    ;
     ; GFX11-LABEL: name: kernel_store_private_s32_to_1_constant_4096
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: kernel_store_private_s32_to_1_constant_4096
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_BYTE [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
@@ -703,17 +858,26 @@ body: |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_4_wave_address
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
     G_STORE %0, %1 :: (store (s32), align 4, addrspace 5)
@@ -740,11 +904,13 @@ body: |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -754,6 +920,13 @@ body: |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4095, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
     %2:sgpr(s32) = G_CONSTANT i32 4095
@@ -797,6 +970,13 @@ body: |
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4095, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
     %2:vgpr(s32) = G_CONSTANT i32 4095
@@ -838,6 +1018,7 @@ body: |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ;
     ; GFX11-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
     ; GFX11: liveins: $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -847,6 +1028,13 @@ body: |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
     ; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
+    ;
+    ; GFX12-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX12-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+    ; GFX12-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4096, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
     %2:sgpr(s32) = G_CONSTANT i32 4096

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
index 324839482976e9..e67f3620d013c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
@@ -4,6 +4,7 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
 
 ---
 name: test_load_flat_s1_align1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir
index c0fb02fb184cf9..d6acc6ecdfc660 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir
@@ -16,6 +16,8 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-HSA %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-HSA %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s
 
 # ERR-NOT: remark
 # ERR: remark: <unknown>:0:0: unable to legalize instruction: %{{[0-9]+}}:_(<2 x s32>) = G_LOAD %{{[0-9]+}}:_(p1) :: (load (<2 x s16>), align 1, addrspace 1) (in function: test_extload_global_v2s32_from_v2s16_align1)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
index a94df99206304a..1249de647bb759 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
@@ -9,6 +9,8 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX10-UNALIGNED %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -mattr=-unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11-UNALIGNED %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer -mattr=-unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11-UNALIGNED %s
 
 ---
 name: test_load_local_s1_align1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
index 40884567f9962a..23a0524b69ffa5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
@@ -5,6 +5,7 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX11 %s
 
 ---
 name: test_load_private_s1_align1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
index c2fdbff77868f0..f20e481ccd4b78 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
@@ -5,6 +5,7 @@
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
 
 ---
 name: test_store_global_s1_align1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 4ab963ed85ccad..d76c9e4c1c1346 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
 
 define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX10-LABEL: global_atomic_csub:
@@ -17,6 +18,13 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_atomic_csub:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
   ret i32 %ret
 }
@@ -39,6 +47,13 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_atomic_csub_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
   ret i32 %ret
@@ -58,6 +73,13 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_atomic_csub_nortn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
   ret void
 }
@@ -80,6 +102,13 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_atomic_csub_offset_nortn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
   ret void
@@ -112,6 +141,18 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
   store i32 %ret, ptr addrspace(1) undef
@@ -139,6 +180,14 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index f45d6d0696498c..1df9a250a3159d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s
 
 define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX908-LABEL: syncscope_system:
@@ -85,6 +86,32 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX1100-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1200-LABEL: syncscope_system:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX1200-NEXT:    s_mov_b32 s0, 0
+; GFX1200-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    buffer_gl0_inv
+; GFX1200-NEXT:    buffer_gl1_inv
+; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1200-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1200-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val seq_cst
   ret float %res
 }
@@ -175,6 +202,15 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    buffer_gl0_inv
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1200-LABEL: syncscope_workgroup_rtn:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    buffer_gl0_inv
+; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
   ret float %res
 }
@@ -296,6 +332,16 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT:    buffer_gl0_inv
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1200-LABEL: syncscope_workgroup_nortn:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    flat_atomic_add_f32 v[0:1], v2
+; GFX1200-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    buffer_gl0_inv
+; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
   ret void
 }
@@ -374,6 +420,31 @@ define float @no_unsafe(ptr %addr, float %val) {
 ; GFX1100-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1200-LABEL: no_unsafe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX1200-NEXT:    s_mov_b32 s0, 0
+; GFX1200-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    buffer_gl0_inv
+; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1200-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1200-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
   ret float %res
 }

diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 2456da62187737..a8dcb74c7d5f92 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -4,6 +4,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX12 %s
 
 ; Should not merge this to a dword load
 define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
@@ -47,6 +48,13 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_load_2xi16_align2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 2
   %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
@@ -112,6 +120,16 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_2xi16_align2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
   store i16 1, ptr addrspace(1) %r, align 2
   store i16 2, ptr addrspace(1) %gep.r, align 2
@@ -172,6 +190,13 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_load_2xi16_align1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 1
   %p.1 = load i16, ptr addrspace(1) %gep.p, align 1
@@ -248,6 +273,16 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_2xi16_align1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
   store i16 1, ptr addrspace(1) %r, align 1
   store i16 2, ptr addrspace(1) %gep.r, align 1
@@ -290,6 +325,13 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 {
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_load_2xi16_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 4
   %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
@@ -349,6 +391,16 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_2xi16_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
   store i16 1, ptr addrspace(1) %r, align 4
   store i16 2, ptr addrspace(1) %gep.r, align 2

diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 08e36413322b64..61f0e5e2a88928 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -7,6 +7,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10-FLASTSCR %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11-FLASTSCR %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX12-FLASTSCR %s
 
 ; Should not merge this to a dword load
 define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
@@ -70,6 +72,20 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_load_2xi16_align2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FLASTSCR-LABEL: private_load_2xi16_align2:
+; GFX12-FLASTSCR:       ; %bb.0:
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 2
   %p.1 = load i16, ptr addrspace(5) %gep.p, align 2
@@ -144,6 +160,20 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
 ; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_store_2xi16_align2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX12-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FLASTSCR-LABEL: private_store_2xi16_align2:
+; GFX12-FLASTSCR:       ; %bb.0:
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX12-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
   store i16 1, ptr addrspace(5) %r, align 2
   store i16 2, ptr addrspace(5) %gep.r, align 2
@@ -222,6 +252,20 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_load_2xi16_align1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FLASTSCR-LABEL: private_load_2xi16_align1:
+; GFX12-FLASTSCR:       ; %bb.0:
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 1
   %p.1 = load i16, ptr addrspace(5) %gep.p, align 1
@@ -301,6 +345,20 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
 ; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_store_2xi16_align1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX12-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FLASTSCR-LABEL: private_store_2xi16_align1:
+; GFX12-FLASTSCR:       ; %bb.0:
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX12-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
   store i16 1, ptr addrspace(5) %r, align 1
   store i16 2, ptr addrspace(5) %gep.r, align 1
@@ -364,6 +422,20 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_load_2xi16_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FLASTSCR-LABEL: private_load_2xi16_align4:
+; GFX12-FLASTSCR:       ; %bb.0:
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 4
   %p.1 = load i16, ptr addrspace(5) %gep.p, align 2
@@ -435,6 +507,20 @@ define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r
 ; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: private_store_2xi16_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX12-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FLASTSCR-LABEL: private_store_2xi16_align4:
+; GFX12-FLASTSCR:       ; %bb.0:
+; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX12-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
   store i16 1, ptr addrspace(5) %r, align 4
   store i16 2, ptr addrspace(5) %gep.r, align 2

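The fast-unaligned-load-store.private.ll checks above show GFX12 merging a pair of adjacent i16 private accesses (at align 1, 2, and 4) into a single scratch_load_b32/scratch_store_b32, for both the default and the flat-scratch configurations. As a minimal standalone sketch of the IR shape being exercised (function name is illustrative, not taken from the patch):

define i32 @load_2xi16_merged(ptr addrspace(5) %p) {
  ; Two adjacent align-2 halves; on gfx1200 these are expected to fold
  ; into one scratch_load_b32, as in the GFX12 check lines above.
  %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
  %p.0 = load i16, ptr addrspace(5) %p, align 2
  %p.1 = load i16, ptr addrspace(5) %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}
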
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index ad4d4a4a30fc6d..93e8630dc7f560 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
 
 ; vgpr offset
 
@@ -22,6 +23,13 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_v:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
   %load = load i8, ptr addrspace(5) %gep, align 4
   %ext = zext i8 %load to i32
@@ -47,6 +55,13 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_v:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_i8 v0, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
   %load = load i8, ptr addrspace(5) %gep, align 4
   %ext = sext i8 %load to i32
@@ -72,6 +87,13 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_zext_v:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_u16 v0, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
   %load = load i16, ptr addrspace(5) %gep, align 4
   %ext = zext i16 %load to i32
@@ -97,6 +119,13 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_sext_v:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_i16 v0, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
   %load = load i16, ptr addrspace(5) %gep, align 4
   %ext = sext i16 %load to i32
@@ -125,6 +154,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -155,6 +192,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -185,6 +230,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
   %load_lo = load i16, ptr addrspace(5) %gep
@@ -215,6 +268,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v3, -1
+; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -245,6 +306,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v3, -1
+; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -275,6 +344,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v3, -1
+; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
   %load_lo = load i16, ptr addrspace(5) %gep
@@ -303,6 +380,13 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspac
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_store_d16_hi_b8 v2, v0, off offset:4
+; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <4 x i8>, ptr %in
   %element = extractelement <4 x i8> %load, i32 2
@@ -331,6 +415,13 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2
+; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <2 x i16>, ptr %in
   %element = extractelement <2 x i16> %load, i32 1
@@ -362,6 +453,13 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_s:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_u8 v2, off, s0 offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
   %load = load i8, ptr addrspace(5) %gep, align 4
   %ext = zext i8 %load to i32
@@ -387,6 +485,13 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_s:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_i8 v2, off, s0 offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
   %load = load i8, ptr addrspace(5) %gep, align 4
   %ext = sext i8 %load to i32
@@ -412,6 +517,13 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_zext_s:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_u16 v2, off, s0 offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
   %load = load i16, ptr addrspace(5) %gep, align 4
   %ext = zext i16 %load to i32
@@ -437,6 +549,13 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_sext_s:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_i16 v2, off, s0 offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
   %load = load i16, ptr addrspace(5) %gep, align 4
   %ext = sext i16 %load to i32
@@ -466,6 +585,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_u8 v2, off, s0 offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -497,6 +624,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_i8 v2, off, s0 offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -528,6 +663,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_b16 v2, off, s0 offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
   %load_lo = load i16, ptr addrspace(5) %gep
@@ -559,6 +702,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    scratch_load_d16_hi_u8 v2, off, s0 offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -590,6 +741,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    scratch_load_d16_hi_i8 v2, off, s0 offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
   %load_lo = load i8, ptr addrspace(5) %gep
@@ -621,6 +780,14 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v2, -1
+; GFX12-NEXT:    scratch_load_d16_hi_b16 v2, off, s0 offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
 bb:
   %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
   %load_lo = load i16, ptr addrspace(5) %gep
@@ -649,6 +816,13 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspac
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_store_d16_hi_b8 off, v0, s0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_store_d16_hi_b8 off, v0, s0 offset:4
+; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <4 x i8>, ptr %in
   %element = extractelement <4 x i8> %load, i32 2
@@ -677,6 +851,13 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_store_d16_hi_b16 off, v0, s0 offset:2
+; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <2 x i16>, ptr %in
   %element = extractelement <2 x i16> %load, i32 1
@@ -710,6 +891,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_svs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
   %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
@@ -739,6 +928,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_svs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    scratch_load_i8 v0, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
   %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
@@ -768,6 +965,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_zext_svs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    scratch_load_u16 v0, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
   %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
@@ -797,6 +1002,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_sext_svs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    scratch_load_i16 v0, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v0
+; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
   %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
@@ -830,6 +1043,15 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
@@ -865,6 +1087,15 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
@@ -900,6 +1131,15 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
@@ -935,6 +1175,15 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, -1
+; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
@@ -970,6 +1219,15 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, -1
+; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
@@ -1005,6 +1263,15 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, -1
+; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[1:2], v3
+; GFX12-NEXT:    s_endpgm
 bb:
   %voffset4 = mul i32 %voffset, 4
   %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
@@ -1038,6 +1305,14 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_store_d16_hi_b8 v1, v0, off offset:4
+; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <4 x i8>, ptr %in
   %element = extractelement <4 x i8> %load, i32 2
@@ -1071,6 +1346,14 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_store_d16_hi_b16 v1, v0, off offset:2
+; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <2 x i16>, ptr %in
   %element = extractelement <2 x i16> %load, i32 1

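The flat-scratch-i8-i16.ll additions above cover GFX12 selection of sub-dword scratch loads into the low and high halves of a 32-bit VGPR (scratch_load_d16_* and scratch_load_d16_hi_*) across vgpr, sgpr, and sgpr+vgpr addressing. A minimal sketch of the d16-hi load pattern, assuming the same llc invocation as the RUN lines above (-mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch; names are illustrative):

define amdgpu_ps void @d16_hi_sketch(ptr addrspace(5) %in, ptr %out, <2 x i16> %reg) {
bb:
  ; Load an i16 from scratch and insert it into the high half of an
  ; existing <2 x i16>; this is the shape that selects scratch_load_d16_hi_b16
  ; while leaving the low half of the destination VGPR intact.
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_hi = load i16, ptr addrspace(5) %gep
  %vec = insertelement <2 x i16> %reg, i16 %load_hi, i32 1
  store <2 x i16> %vec, ptr %out
  ret void
}
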
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 8d28c7845167a2..04eb6dcff4632b 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefixes=GFX940-GISEL
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-SDAG
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-SDAG
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-GISEL
 
 ; Test flat scratch SVS addressing mode with various combinations of alignment
 ; of soffset, voffset and inst_offset.
@@ -86,6 +88,36 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff1_voff1:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff1_voff1:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff1 = mul i32 %soff, 1
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -182,6 +214,38 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff1_voff2:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff1_voff2:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff1 = mul i32 %soff, 1
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -278,6 +342,38 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff1_voff4:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff1_voff4:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff1 = mul i32 %soff, 1
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -375,6 +471,40 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff2_voff1:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff2_voff1:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff2 = mul i32 %soff, 2
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -475,6 +605,40 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff2_voff2:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff2_voff2:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff2 = mul i32 %soff, 2
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -575,6 +739,40 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff2_voff4:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff2_voff4:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff2 = mul i32 %soff, 2
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -672,6 +870,40 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff4_voff1:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff4_voff1:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff4 = mul i32 %soff, 4
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -772,6 +1004,40 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff4_voff2:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff4_voff2:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff4 = mul i32 %soff, 4
   %a = alloca i8, i32 64, align 4, addrspace(5)
@@ -870,6 +1136,40 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: soff4_voff4:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: soff4_voff4:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff4 = mul i32 %soff, 4
   %a = alloca i8, i32 64, align 4, addrspace(5)

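The flat-scratch-svs.ll checks above exercise the GFX12 SVS (sgpr+vgpr) scratch addressing mode: the scalar offset is folded via s_lshl_b32/v_add3_u32 and the small constant part is carried as inst_offset. A minimal sketch of the pattern under test (names are illustrative; the volatile store appears to account for the th:TH_STORE_NT_RT cache policy and the s_waitcnt_vscnt seen in the GFX12 checks):

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @svs_sketch(i32 %soff) {
bb:
  %a = alloca i8, i32 64, align 4, addrspace(5)
  %voff = call i32 @llvm.amdgcn.workitem.id.x()
  ; Address = alloca base + scalar offset + vector offset + immediate 1;
  ; the immediate is expected to land in the instruction's offset field.
  %gep.s = getelementptr inbounds i8, ptr addrspace(5) %a, i32 %soff
  %gep.sv = getelementptr inbounds i8, ptr addrspace(5) %gep.s, i32 %voff
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep.sv, i32 1
  store volatile i8 1, ptr addrspace(5) %gep
  ret void
}
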
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index f1a2fe88677533..98379f5e3c68b4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -2,11 +2,13 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-PAL %s
 
 define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-LABEL: zero_init_kernel:
@@ -63,6 +65,22 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:4
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: zero_init_kernel:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mov_b32 s1, s0
+; GFX12-NEXT:    s_mov_b32 s2, s0
+; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:52
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:36
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:20
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:4
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: zero_init_kernel:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -165,6 +183,22 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:20
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:4
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: zero_init_kernel:
+; GFX12-PAL:       ; %bb.0:
+; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-PAL-NEXT:    s_clause 0x3
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:52
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:36
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:20
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:4
+; GFX12-PAL-NEXT:    s_endpgm
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
@@ -223,6 +257,23 @@ define void @zero_init_foo() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: zero_init_foo:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mov_b32 s1, s0
+; GFX12-NEXT:    s_mov_b32 s2, s0
+; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: zero_init_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -290,6 +341,23 @@ define void @zero_init_foo() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: zero_init_foo:
+; GFX12-PAL:       ; %bb.0:
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-PAL-NEXT:    s_clause 0x3
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
@@ -350,6 +418,22 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_sindex_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-NEXT:    s_add_co_i32 s1, s1, 4
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_sindex_kernel:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
@@ -428,6 +512,22 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_sindex_kernel:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-PAL-NEXT:    s_add_co_i32 s1, s1, 4
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
@@ -487,6 +587,20 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_sindex_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-NEXT:    s_add_co_i32 s1, s1, 4
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_sindex_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -558,6 +672,20 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_sindex_foo:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-PAL-NEXT:    s_add_co_i32 s1, s1, 4
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
@@ -610,6 +738,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_vindex_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_vindex_kernel:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -671,6 +810,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_vindex_kernel:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -724,6 +874,19 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_vindex_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_vindex_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -777,6 +940,19 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_vindex_foo:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
+; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
@@ -810,6 +986,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: private_ptr_foo:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: private_ptr_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -839,6 +1022,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: private_ptr_foo:
+; GFX12-PAL:       ; %bb.0:
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
   ret void
@@ -905,6 +1095,24 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:308
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: zero_init_small_offset_kernel:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mov_b32 s1, s0
+; GFX12-NEXT:    s_mov_b32 s2, s0
+; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:260
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:276
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:292
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:308
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -1017,6 +1225,24 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:292
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:308
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: zero_init_small_offset_kernel:
+; GFX12-PAL:       ; %bb.0:
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-PAL-NEXT:    s_clause 0x3
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:260
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:276
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:292
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:308
+; GFX12-PAL-NEXT:    s_endpgm
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
@@ -1084,6 +1310,25 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: zero_init_small_offset_foo:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mov_b32 s1, s0
+; GFX12-NEXT:    s_mov_b32 s2, s0
+; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: zero_init_small_offset_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1159,6 +1404,25 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: zero_init_small_offset_foo:
+; GFX12-PAL:       ; %bb.0:
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-PAL-NEXT:    s_clause 0x3
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
@@ -1228,6 +1492,24 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_sindex_small_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x104
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x104
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
@@ -1343,6 +1625,24 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x104
+; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x104
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1412,6 +1712,22 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_sindex_small_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x104
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x104
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -1518,6 +1834,22 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x104
+; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x104
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1579,6 +1911,18 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_vindex_small_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -1672,6 +2016,18 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1737,6 +2093,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_vindex_small_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x100
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1802,6 +2173,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_vindex_small_offset_foo:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-PAL-NEXT:    s_add_co_i32 s0, s32, 0x100
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1879,6 +2265,24 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: zero_init_large_offset_kernel:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mov_b32 s1, s0
+; GFX12-NEXT:    s_mov_b32 s2, s0
+; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16388
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16404
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16420
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -1996,6 +2400,24 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: zero_init_large_offset_kernel:
+; GFX12-PAL:       ; %bb.0:
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-PAL-NEXT:    s_clause 0x3
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16388
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16404
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16420
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-PAL-NEXT:    s_endpgm
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
@@ -2074,6 +2496,25 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: zero_init_large_offset_foo:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mov_b32 s1, s0
+; GFX12-NEXT:    s_mov_b32 s2, s0
+; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16384
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16400
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16416
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16432
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2190,6 +2631,25 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: zero_init_large_offset_foo:
+; GFX12-PAL:       ; %bb.0:
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-PAL-NEXT:    s_clause 0x3
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16384
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16400
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16416
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16432
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
@@ -2259,6 +2719,24 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_sindex_large_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x4004
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
@@ -2374,6 +2852,24 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x4004
+; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -2443,6 +2939,22 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_sindex_large_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x4004
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -2549,6 +3061,22 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x4004
+; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -2611,6 +3139,18 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_vindex_large_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -2706,6 +3246,18 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -2772,6 +3324,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_vindex_large_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2839,6 +3406,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_vindex_large_offset_foo:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-PAL-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -2900,6 +3482,17 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_large_imm_offset_kernel:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT:    scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -2993,6 +3586,17 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
   %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
@@ -3050,6 +3654,18 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_large_imm_offset_foo:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3108,6 +3724,18 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_large_imm_offset_foo:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
   %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
@@ -3166,6 +3794,19 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: store_load_vidx_sidx_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
@@ -3234,6 +3875,19 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: store_load_vidx_sidx_offset:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %alloca = alloca [32 x i32], align 4, addrspace(5)
   %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -3278,6 +3932,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_i64_aligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_i64_aligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3320,6 +3984,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_i64_aligned:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 8
   %load = load volatile i64, ptr addrspace(5) %arg, align 8
@@ -3359,6 +4033,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_i64_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_i64_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3401,6 +4085,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_i64_unaligned:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 1
   %load = load volatile i64, ptr addrspace(5) %arg, align 1
@@ -3443,6 +4137,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_v3i32_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-NEXT:    v_mov_b32_e32 v3, 3
+; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3489,6 +4194,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_v3i32_unaligned:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, 3
+; GFX12-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
   %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1
@@ -3533,6 +4249,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_v4i32_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
+; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3582,6 +4309,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_v4i32_unaligned:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
+; GFX12-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
   %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1
@@ -3620,6 +4358,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_i32_negative_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3672,6 +4420,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
   store volatile i8 1, ptr addrspace(5) %ptr, align 1
@@ -3712,6 +4470,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: store_load_i32_large_negative_unaligned:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3765,6 +4533,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
   store volatile i8 1, ptr addrspace(5) %ptr, align 1
@@ -3842,6 +4620,25 @@ define amdgpu_ps void @large_offset() {
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: large_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v1
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-PAL-LABEL: large_offset:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
@@ -3940,6 +4737,25 @@ define amdgpu_ps void @large_offset() {
 ; GFX11-PAL-NEXT:    ; use v1
 ; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: large_offset:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-PAL-NEXT:    ;;#ASMSTART
+; GFX12-PAL-NEXT:    ; use v0
+; GFX12-PAL-NEXT:    ;;#ASMEND
+; GFX12-PAL-NEXT:    ;;#ASMSTART
+; GFX12-PAL-NEXT:    ; use v1
+; GFX12-PAL-NEXT:    ;;#ASMEND
+; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
   %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 9eb1b484e461a8..f1879f28766786 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_add_i64_offset:
@@ -32,6 +33,19 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -76,6 +90,22 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -121,6 +151,24 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_add_u64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -170,6 +218,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -204,6 +269,19 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -243,6 +321,22 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -283,6 +377,24 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_add_u64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -327,6 +439,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -364,6 +493,19 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -408,6 +550,22 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -453,6 +611,24 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_and_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -502,6 +678,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -536,6 +729,19 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -575,6 +781,22 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -615,6 +837,24 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_and_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -659,6 +899,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -696,6 +953,19 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -740,6 +1010,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -785,6 +1071,24 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_sub_u64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -834,6 +1138,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -868,6 +1189,19 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -907,6 +1241,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -947,6 +1297,24 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_sub_u64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -991,6 +1359,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -1026,6 +1411,18 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1070,6 +1467,21 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1113,6 +1525,23 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_i64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -1162,6 +1591,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -1194,6 +1639,18 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -1233,6 +1690,21 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr %out2
@@ -1271,6 +1743,23 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_i64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1315,6 +1804,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1350,6 +1855,18 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1394,6 +1911,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1437,6 +1969,23 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_u64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -1486,6 +2035,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -1518,6 +2083,18 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -1557,6 +2134,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr %out2
@@ -1595,6 +2187,23 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_u64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1639,6 +2248,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1674,6 +2299,18 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1718,6 +2355,21 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1761,6 +2413,23 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_i64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -1810,6 +2479,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -1842,6 +2527,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -1881,6 +2578,21 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr %out2
@@ -1919,6 +2631,23 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_i64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1963,6 +2692,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1998,6 +2743,18 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -2042,6 +2799,21 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
@@ -2085,6 +2857,23 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -2134,6 +2923,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -2166,6 +2971,18 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -2205,6 +3022,21 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr %out2
@@ -2243,6 +3075,23 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -2287,6 +3136,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -2324,6 +3189,19 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -2368,6 +3246,22 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -2413,6 +3307,24 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_or_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -2462,6 +3374,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -2496,6 +3425,19 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -2535,6 +3477,22 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -2575,6 +3533,24 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_or_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -2619,6 +3595,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -2656,6 +3649,19 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -2692,6 +3698,19 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_f64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr %out, i64 4
   %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst
@@ -2728,6 +3747,19 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_pointer_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr ptr, ptr %out, i32 4
   %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst
@@ -2772,6 +3804,22 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -2817,6 +3865,24 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -2866,6 +3932,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -2900,6 +3983,19 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -2939,6 +4035,22 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -2979,6 +4091,24 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -3023,6 +4153,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -3060,6 +4207,19 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -3104,6 +4264,22 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -3149,6 +4325,24 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_xor_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -3198,6 +4392,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -3232,6 +4443,19 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -3271,6 +4495,22 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -3311,6 +4551,24 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_xor_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -3355,6 +4613,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -3394,6 +4669,19 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %in, i64 4
   %val = load atomic i64, ptr %gep  seq_cst, align 8
@@ -3429,6 +4717,19 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8
   store i64 %val, ptr %out
@@ -3475,6 +4776,24 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %in, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -3519,6 +4838,24 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index)
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %in, i64 %index
   %val = load atomic i64, ptr %ptr seq_cst, align 8
@@ -3552,6 +4889,15 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   store atomic i64 %in, ptr %gep  seq_cst, align 8
@@ -3580,6 +4926,15 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   store atomic i64 %in, ptr %out seq_cst, align 8
   ret void
@@ -3619,6 +4974,20 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -3656,6 +5025,20 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   store atomic i64 %in, ptr %ptr seq_cst, align 8
@@ -3698,6 +5081,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -3740,6 +5139,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_soffset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 9000
   %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -3786,6 +5201,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -3834,6 +5264,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -3889,6 +5335,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -3930,6 +5395,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
@@ -3971,6 +5452,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
@@ -4014,6 +5510,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -4064,6 +5576,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -4104,6 +5635,19 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr %in, i64 4
   %val = load atomic double, ptr %gep  seq_cst, align 8
@@ -4139,6 +5683,19 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8
   store double %val, ptr %out
@@ -4185,6 +5742,24 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr %in, i64 %index
   %gep = getelementptr double, ptr %ptr, i64 4
@@ -4229,6 +5804,24 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index)
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr %in, i64 %index
   %val = load atomic double, ptr %ptr seq_cst, align 8
@@ -4262,6 +5855,15 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr %out, i64 4
   store atomic double %in, ptr %gep  seq_cst, align 8
@@ -4290,6 +5892,15 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   store atomic double %in, ptr %out seq_cst, align 8
   ret void
@@ -4329,6 +5940,20 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr %out, i64 %index
   %gep = getelementptr double, ptr %ptr, i64 4
@@ -4366,6 +5991,20 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr %out, i64 %index
   store atomic double %in, ptr %ptr seq_cst, align 8
@@ -4402,6 +6041,19 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -4446,6 +6098,22 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -4491,6 +6159,24 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_incr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -4540,6 +6226,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -4574,6 +6277,19 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -4613,6 +6329,22 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -4653,6 +6385,24 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_incr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -4697,6 +6447,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_ret_incr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -4734,6 +6501,19 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -4778,6 +6558,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
@@ -4823,6 +6619,24 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_decr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -4872,6 +6686,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
@@ -4906,6 +6737,19 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -4945,6 +6789,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
@@ -4985,6 +6845,24 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_decr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
@@ -5029,6 +6907,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_ret_decr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst

diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
new file mode 100644
index 00000000000000..5c3e83c4b9cfe1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
+
+declare float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data)
+declare float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data)
+
+define amdgpu_cs void @flat_atomic_fmin_num_f32_noret(ptr %ptr, float %data) {
+; GFX12-LABEL: flat_atomic_fmin_num_f32_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs void @flat_atomic_fmax_num_f32_noret(ptr %ptr, float %data) {
+; GFX12-LABEL: flat_atomic_fmax_num_f32_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs float @flat_atomic_fmin_num_f32_rtn(ptr %ptr, float %data, ptr %out) {
+; GFX12-LABEL: flat_atomic_fmin_num_f32_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
+  %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data)
+  store float %ret, ptr %out
+  ret float %ret
+}
+
+define amdgpu_cs float @flat_atomic_fmax_num_f32_rtn(ptr %ptr, float %data, ptr %out) {
+; GFX12-LABEL: flat_atomic_fmax_num_f32_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
+  %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data)
+  store float %ret, ptr %out
+  ret float %ret
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}

diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
new file mode 100644
index 00000000000000..cf069ee92639f8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc < %s -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
+
+declare float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+declare float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+
+define amdgpu_cs void @global_atomic_fmin_num_f32_noret(ptr addrspace(1) %ptr, float %data) {
+; GFX12-LABEL: global_atomic_fmin_num_f32_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmax_num_f32_noret(ptr addrspace(1) %ptr, float %data) {
+; GFX12-LABEL: global_atomic_fmax_num_f32_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmax_num_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) {
+; GFX12-LABEL: global_atomic_fmax_num_f32_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  store float %ret, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @global_atomic_fmin_num_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) {
+; GFX12-LABEL: global_atomic_fmin_num_f32_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+  %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  store float %ret, ptr addrspace(1) %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}

diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index c2b51467edd08d..8cd25298d74739 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -1,21 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
 
 define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
-  ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
-  ; GFX908_GFX11: bb.0 (%ir-block.0):
-  ; GFX908_GFX11-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX908_GFX11-NEXT: {{  $}}
-  ; GFX908_GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX908_GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX908_GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX908_GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX908_GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; GFX908_GFX11-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
-  ; GFX908_GFX11-NEXT:   S_ENDPGM 0
+  ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
+  ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
+  ; GFX908_GFX11_GFX12-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX908_GFX11_GFX12-NEXT: {{  $}}
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908_GFX11_GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; GFX908_GFX11_GFX12-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+  ; GFX908_GFX11_GFX12-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -33,17 +34,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1)
 }
 
 define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
-  ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
-  ; GFX908_GFX11: bb.0 (%ir-block.0):
-  ; GFX908_GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
-  ; GFX908_GFX11-NEXT: {{  $}}
-  ; GFX908_GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX908_GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX908_GFX11-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX908_GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX908_GFX11-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX908_GFX11-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
-  ; GFX908_GFX11-NEXT:   S_ENDPGM 0
+  ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
+  ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
+  ; GFX908_GFX11_GFX12-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
+  ; GFX908_GFX11_GFX12-NEXT: {{  $}}
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX908_GFX11_GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX908_GFX11_GFX12-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908_GFX11_GFX12-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+  ; GFX908_GFX11_GFX12-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -61,17 +62,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspa
 }
 
 define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
-  ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
-  ; GFX908_GFX11: bb.0 (%ir-block.0):
-  ; GFX908_GFX11-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX908_GFX11-NEXT: {{  $}}
-  ; GFX908_GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX908_GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX908_GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX908_GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX908_GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; GFX908_GFX11-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
-  ; GFX908_GFX11-NEXT:   S_ENDPGM 0
+  ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
+  ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
+  ; GFX908_GFX11_GFX12-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX908_GFX11_GFX12-NEXT: {{  $}}
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908_GFX11_GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; GFX908_GFX11_GFX12-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+  ; GFX908_GFX11_GFX12-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -89,17 +90,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspac
 }
 
 define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
-  ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
-  ; GFX908_GFX11: bb.0 (%ir-block.0):
-  ; GFX908_GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
-  ; GFX908_GFX11-NEXT: {{  $}}
-  ; GFX908_GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX908_GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX908_GFX11-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX908_GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX908_GFX11-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX908_GFX11-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
-  ; GFX908_GFX11-NEXT:   S_ENDPGM 0
+  ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
+  ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
+  ; GFX908_GFX11_GFX12-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
+  ; GFX908_GFX11_GFX12-NEXT: {{  $}}
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX908_GFX11_GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX908_GFX11_GFX12-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908_GFX11_GFX12-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+  ; GFX908_GFX11_GFX12-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -117,17 +118,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr ad
 }
 
 define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) #0 {
-  ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
-  ; GFX908_GFX11: bb.0 (%ir-block.0):
-  ; GFX908_GFX11-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX908_GFX11-NEXT: {{  $}}
-  ; GFX908_GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX908_GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX908_GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX908_GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX908_GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; GFX908_GFX11-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
-  ; GFX908_GFX11-NEXT:   S_ENDPGM 0
+  ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
+  ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
+  ; GFX908_GFX11_GFX12-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX908_GFX11_GFX12-NEXT: {{  $}}
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908_GFX11_GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX908_GFX11_GFX12-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; GFX908_GFX11_GFX12-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
+  ; GFX908_GFX11_GFX12-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -145,6 +146,68 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1)
 }
 
 define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
+  ; GFX908-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
+  ; GFX908: bb.0 (%ir-block.0):
+  ; GFX908-NEXT:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
+  ; GFX908-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX908-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX908-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX908-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]]
+  ; GFX908-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+  ; GFX908-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   S_BRANCH %bb.1
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1 (%ir-block.5):
+  ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+  ; GFX908-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+  ; GFX908-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+  ; GFX908-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX908-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GFX908-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
+  ; GFX908-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+  ; GFX908-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+  ; GFX908-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+  ; GFX908-NEXT:   [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
+  ; GFX908-NEXT:   early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
+  ; GFX908-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+  ; GFX908-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   S_BRANCH %bb.2
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2 (%ir-block.35):
+  ; GFX908-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1
+  ; GFX908-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.3.Flow:
+  ; GFX908-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.4 (%ir-block.37):
+  ; GFX908-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   S_ENDPGM 0
+  ;
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
@@ -206,6 +269,60 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37):
   ; GFX90A_GFX940-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
+  ; GFX11_GFX12: bb.0 (%ir-block.0):
+  ; GFX11_GFX12-NEXT:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
+  ; GFX11_GFX12-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX11_GFX12-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX11_GFX12-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX11_GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX11_GFX12-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]]
+  ; GFX11_GFX12-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_32 = SI_PS_LIVE
+  ; GFX11_GFX12-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT:   S_BRANCH %bb.1
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT: bb.1 (%ir-block.5):
+  ; GFX11_GFX12-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo
+  ; GFX11_GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX11_GFX12-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
+  ; GFX11_GFX12-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+  ; GFX11_GFX12-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 356, 15, 15, 0, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GFX11_GFX12-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11_GFX12-NEXT:   early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec
+  ; GFX11_GFX12-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+  ; GFX11_GFX12-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT:   S_BRANCH %bb.2
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT: bb.2 (%ir-block.28):
+  ; GFX11_GFX12-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX11_GFX12-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], %1, [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT: bb.3.Flow:
+  ; GFX11_GFX12-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT: {{  $}}
+  ; GFX11_GFX12-NEXT: bb.4 (%ir-block.30):
+  ; GFX11_GFX12-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT:   S_ENDPGM 0
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index c559750d4f1b46..77be7f506aaf2a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 2099ed7dafc379..de4f748413e6c1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 ; Test using saddr addressing mode of global_*load_* flat instructions.
 
@@ -24,6 +26,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %load = load i8, ptr addrspace(1) %sbase
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
@@ -52,6 +61,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_4095:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -74,6 +90,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_4096:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -96,6 +119,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_4097:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4097
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -126,6 +156,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_neg4096:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -160,6 +197,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_neg4097:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4097
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -194,6 +238,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_neg4098:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4098
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -223,6 +274,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_2048:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -252,6 +310,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2049
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_2049:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2049
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -281,6 +346,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2050
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_2050:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2050
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -303,6 +375,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_neg2048:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -333,6 +412,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_neg2049:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -363,6 +449,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2050
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_neg2050:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2050
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -370,27 +463,109 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
+define amdgpu_ps float @global_load_saddr_i8_offset_0x7FFFFF(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_0x7FFFFF:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7ff000
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_0x7FFFFF:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7ff800
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: global_load_saddr_i8_offset_0x7FFFFF:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7ff000
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_0x7FFFFF:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:8388607
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 8388607
+  %load = load i8, ptr addrspace(1) %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s2, 0xff800000
+; GFX9-NEXT:    s_addc_u32 s1, s3, -1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xff800000, s2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0xff800000, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-8388608
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -8388608
+  %load = load i8, ptr addrspace(1) %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xfffff000
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_4294967295:
+; GFX10-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xfffff800
 ; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_4294967295:
+; GFX11-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0xfffff000
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0xff800000
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:8388607
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -398,8 +573,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(ptr addrspace(1)
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
+define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
@@ -408,7 +583,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
+; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
@@ -416,7 +591,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_4294967296:
+; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -424,6 +599,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000000:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -431,8 +625,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1)
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
+define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_0x100000001:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
@@ -441,7 +635,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
+; GFX10-LABEL: global_load_saddr_i8_offset_0x100000001:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
@@ -449,7 +643,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_4294967297:
+; GFX11-LABEL: global_load_saddr_i8_offset_0x100000001:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -457,6 +651,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:1
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000001:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 1
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -464,8 +677,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1)
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
+define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_0x100000FFF:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s2, 0xfff
 ; GFX9-NEXT:    s_addc_u32 s1, s3, 1
@@ -474,7 +687,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
+; GFX10-LABEL: global_load_saddr_i8_offset_0x100000FFF:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x800, s2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
@@ -482,7 +695,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_4294971391:
+; GFX11-LABEL: global_load_saddr_i8_offset_0x100000FFF:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -490,6 +703,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000FFF:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000FFF:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -497,8 +729,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1)
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
+define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_0x100001000:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s2, 0x1000
 ; GFX9-NEXT:    s_addc_u32 s1, s3, 1
@@ -507,7 +739,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
+; GFX10-LABEL: global_load_saddr_i8_offset_0x100001000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x1000, s2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
@@ -515,7 +747,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_4294971392:
+; GFX11-LABEL: global_load_saddr_i8_offset_0x100001000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0x1000, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -523,6 +755,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100001000:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100001000:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -530,8 +781,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1)
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
+define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
@@ -541,7 +792,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
+; GFX10-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x800, s2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
@@ -549,7 +800,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967295:
+; GFX11-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0x1000, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -557,6 +808,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s[0:1], 0x800000, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388607
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 1
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -564,8 +834,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
+define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
@@ -574,7 +844,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
+; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
@@ -582,7 +852,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967296:
+; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -590,6 +860,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000000:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -597,8 +886,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(1) inreg %sbase) {
-; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
+define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace(1) inreg %sbase) {
+; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000001:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
@@ -607,7 +896,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
+; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000001:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
@@ -615,7 +904,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967297:
+; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000001:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -623,6 +912,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000001:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000001:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, -1
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -647,6 +955,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -678,6 +992,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
@@ -721,6 +1041,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4096
@@ -753,6 +1079,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrsp
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4096
@@ -796,6 +1128,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrsp
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4097
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4097
@@ -818,6 +1156,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2047
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
@@ -850,6 +1194,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048
@@ -872,6 +1222,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
@@ -904,6 +1260,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrsp
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2049
@@ -913,6 +1275,103 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrsp
   ret float %to.vgpr
 }
 
+; Maximum positive offset on gfx12.
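+; Note: 8388607 (0x7FFFFF) is the largest value that fits the 24-bit
+; signed immediate offset field of the GFX12 encoding, so the GFX12
+; check below folds it straight into the load, while GFX9/GFX10/GFX11
+; first materialize most of the constant with carry-propagating VALU
+; adds and fold only a small remainder.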
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF(ptr addrspace(1) inreg %sbase, i32 %voffset) {
+; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x7ff800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX11-NEXT:    v_add_co_u32 v0, vcc, 0x7ff000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:8388607
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 8388607
+  %load = load i8, ptr addrspace(1) %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Minimum (most negative) offset on gfx12.
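+; Note: -8388608 (-0x800000) is the most negative value the 24-bit
+; signed offset field can hold; the GFX12 check below folds it, while
+; the older targets materialize the whole constant in VGPRs and use no
+; folded offset.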
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF(ptr addrspace(1) inreg %sbase, i32 %voffset) {
+; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xff800000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0xff800000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX11-NEXT:    v_add_co_u32 v0, vcc, 0xff800000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-8388608
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -8388608
+  %load = load i8, ptr addrspace(1) %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
 ; Maximum positive offset on gfx9, and immediate needs to be moved lower.
 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
@@ -936,6 +1395,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %zext.offset
@@ -958,6 +1423,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %sbase.as.int, %zext.offset
@@ -981,6 +1452,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
@@ -1004,6 +1481,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
@@ -1028,6 +1511,12 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add.immoffset = add i64 %sbase.as.int, 128
@@ -1080,6 +1569,28 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    ds_load_b64 v[1:2], v1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT:    ds_load_b64 v[1:2], v1
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v1, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1124,6 +1635,28 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:42
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    ds_load_b64 v[1:2], v1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] offset:42
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT:    ds_load_b64 v[1:2], v1
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v1, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:42
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1149,6 +1682,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -1172,6 +1712,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-24
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-24
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
@@ -1196,6 +1743,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
@@ -1221,6 +1775,13 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
@@ -1257,6 +1818,26 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, v0, s2
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v0, v2
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -1292,6 +1873,26 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, v0, s2
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v0, v2
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
@@ -1345,6 +1946,36 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_f32_natural_addressing:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, s2, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v2, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
+; GFX12-GISEL-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
   %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1369,6 +2000,14 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr a
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1396,6 +2035,15 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) i
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
   %zext.offset = zext i32 %voffset to i64
   %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1422,6 +2070,15 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr add
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:400
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:400
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1470,6 +2127,36 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, s2, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v2, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
+; GFX12-GISEL-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{}
   %zext.offset = zext i32 %voffset to i64
   %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1493,6 +2180,12 @@ define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -1512,6 +2205,12 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1532,6 +2231,12 @@ define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load half, ptr addrspace(1) %gep0
@@ -1550,6 +2255,12 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1569,6 +2280,12 @@ define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i32, ptr addrspace(1) %gep0
@@ -1588,6 +2305,12 @@ define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1608,6 +2331,12 @@ define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load float, ptr addrspace(1) %gep0
@@ -1626,6 +2355,12 @@ define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1645,6 +2380,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <2 x i16>, ptr addrspace(1) %gep0
@@ -1664,6 +2405,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1684,6 +2431,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <2 x half>, ptr addrspace(1) %gep0
@@ -1702,6 +2455,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1721,6 +2480,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load ptr addrspace(3), ptr addrspace(1) %gep0
@@ -1741,6 +2506,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p3_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1762,6 +2533,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbas
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load double, ptr addrspace(1) %gep0
@@ -1781,6 +2558,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) i
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f64_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1801,6 +2584,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbas
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i64, ptr addrspace(1) %gep0
@@ -1820,6 +2609,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) i
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i64_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1840,6 +2635,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <2 x float>, ptr addrspace(1) %gep0
@@ -1858,6 +2659,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2f32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1877,6 +2684,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <2 x i32>, ptr addrspace(1) %gep0
@@ -1896,6 +2709,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1916,6 +2735,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <4 x i16>, ptr addrspace(1) %gep0
@@ -1935,6 +2760,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1955,6 +2786,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <4 x half>, ptr addrspace(1) %gep0
@@ -1974,6 +2811,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1994,6 +2837,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load ptr addrspace(1), ptr addrspace(1) %gep0
@@ -2014,6 +2863,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) in
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_p1_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2035,6 +2890,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <3 x float>, ptr addrspace(1) %gep0
@@ -2053,6 +2914,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3f32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2072,6 +2939,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <3 x i32>, ptr addrspace(1) %gep0
@@ -2091,6 +2964,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v3i32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2111,6 +2990,12 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v6f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <6 x half>, ptr addrspace(1) %gep0
@@ -2129,6 +3014,12 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v6f16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2148,6 +3039,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <4 x float>, ptr addrspace(1) %gep0
@@ -2166,6 +3063,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4f32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2185,6 +3088,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <4 x i32>, ptr addrspace(1) %gep0
@@ -2204,6 +3113,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4i32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2224,6 +3139,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <2 x i64>, ptr addrspace(1) %gep0
@@ -2243,6 +3164,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2i64_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2263,6 +3190,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i128, ptr addrspace(1) %gep0
@@ -2282,6 +3215,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i128_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2302,6 +3241,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2p1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep0
@@ -2322,6 +3267,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v2p1_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2343,6 +3294,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4p3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep0
@@ -2363,6 +3320,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_v4p3_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2388,6 +3351,12 @@ define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    global_load_i8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i8:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_i8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -2408,6 +3377,12 @@ define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inre
 ; GFX11-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i8_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2429,6 +3404,12 @@ define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    global_load_i16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_i16 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2449,6 +3430,12 @@ define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_i16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_sextload_saddr_i16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_i16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2470,6 +3457,12 @@ define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i8:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -2490,6 +3483,12 @@ define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inre
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i8_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2511,6 +3510,12 @@ define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2531,6 +3536,12 @@ define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inr
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_zextload_saddr_i16_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2567,6 +3578,14 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load atomic i32, ptr addrspace(1) %gep0 seq_cst, align 4
@@ -2597,6 +3616,14 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2628,6 +3655,14 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load atomic i64, ptr addrspace(1) %gep0 seq_cst, align 8
@@ -2658,6 +3693,14 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2682,6 +3725,18 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(
 ; GFX11-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2702,6 +3757,18 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr
 ; GFX11-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2727,6 +3794,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zero_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2751,6 +3833,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2774,6 +3871,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2796,6 +3907,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2819,6 +3944,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_u8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -2842,6 +3981,20 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_u8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2866,6 +4019,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_i8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -2889,6 +4058,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_i8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2915,6 +4100,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(
 ; GFX11-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2935,6 +4133,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr
 ; GFX11-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2960,6 +4171,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2984,6 +4210,21 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -3007,6 +4248,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -3029,6 +4286,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -3052,6 +4325,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -3075,6 +4364,22 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -3099,6 +4404,23 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i8, ptr addrspace(1) %gep0
@@ -3122,6 +4444,23 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -3153,6 +4492,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_or_b32_e32 v0, 16, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.idx = zext i32 %idx to i64
   %or = or i64 %zext.idx, 16
   %addr = inttoptr i64 %or to ptr addrspace(1)
@@ -3178,6 +4525,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_or_b32_e32 v0, 0x1040, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    ; return to shader part epilog
   %zext.idx = zext i32 %idx to i64
   %or = or i64 %zext.idx, 4160
   %addr = inttoptr i64 %or to ptr addrspace(1)
@@ -3196,7 +4551,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:  .LBB128_1: ; %bb3
+; GFX9-NEXT:  .LBB132_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_add_u32 s4, s2, s0
 ; GFX9-NEXT:    s_addc_u32 s5, s3, s1
@@ -3205,7 +4560,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
-; GFX9-NEXT:    s_cbranch_scc0 .LBB128_1
+; GFX9-NEXT:    s_cbranch_scc0 .LBB132_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3213,7 +4568,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-NEXT:  .LBB128_1: ; %bb3
+; GFX10-NEXT:  .LBB132_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s4, s2, s0
@@ -3223,7 +4578,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
-; GFX10-NEXT:    s_cbranch_scc0 .LBB128_1
+; GFX10-NEXT:    s_cbranch_scc0 .LBB132_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -3231,7 +4586,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:  .LBB128_1: ; %bb3
+; GFX11-NEXT:  .LBB132_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_u32 s4, s2, s0
@@ -3241,9 +4596,46 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x400
-; GFX11-NEXT:    s_cbranch_scc0 .LBB128_1
+; GFX11-NEXT:    s_cbranch_scc0 .LBB132_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX12-SDAG-NEXT:  .LBB132_1: ; %bb3
+; GFX12-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[4:5], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 4
+; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_cmp_eq_u32 s0, 0x400
+; GFX12-SDAG-NEXT:    s_cbranch_scc0 .LBB132_1
+; GFX12-SDAG-NEXT:  ; %bb.2: ; %bb2
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-GISEL-NEXT:  .LBB132_1: ; %bb3
+; GFX12-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v4, vcc, v0, v2
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT:    v_add_co_u32 v2, vcc, v2, 4
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
+; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0x400, v2
+; GFX12-GISEL-NEXT:    s_cbranch_vccz .LBB132_1
+; GFX12-GISEL-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   br label %bb3
 
@@ -3267,7 +4659,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:  .LBB129_1: ; %bb3
+; GFX9-NEXT:  .LBB133_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_add_u32 s4, s2, s0
 ; GFX9-NEXT:    s_addc_u32 s5, s3, s1
@@ -3279,7 +4671,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
 ; GFX9-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
-; GFX9-NEXT:    s_cbranch_scc0 .LBB129_1
+; GFX9-NEXT:    s_cbranch_scc0 .LBB133_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3287,7 +4679,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-NEXT:  .LBB129_1: ; %bb3
+; GFX10-NEXT:  .LBB133_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s4, s2, s0
@@ -3300,7 +4692,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
 ; GFX10-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
-; GFX10-NEXT:    s_cbranch_scc0 .LBB129_1
+; GFX10-NEXT:    s_cbranch_scc0 .LBB133_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -3308,7 +4700,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:  .LBB129_1: ; %bb3
+; GFX11-NEXT:  .LBB133_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_u32 s4, s2, s0
@@ -3320,9 +4712,50 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x400
-; GFX11-NEXT:    s_cbranch_scc0 .LBB129_1
+; GFX11-NEXT:    s_cbranch_scc0 .LBB133_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX12-SDAG-NEXT:  .LBB133_1: ; %bb3
+; GFX12-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[4:5], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 4
+; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_cmp_eq_u32 s0, 0x400
+; GFX12-SDAG-NEXT:    s_cbranch_scc0 .LBB133_1
+; GFX12-SDAG-NEXT:  ; %bb.2: ; %bb2
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-GISEL-NEXT:  .LBB133_1: ; %bb3
+; GFX12-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v4, vcc, v0, v2
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT:    v_add_co_u32 v2, vcc, v2, 4
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
+; GFX12-GISEL-NEXT:    global_load_b32 v6, v[4:5], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0x400, v2
+; GFX12-GISEL-NEXT:    s_cbranch_vccz .LBB133_1
+; GFX12-GISEL-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   br label %bb3
 

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
index 9cf5b6b9273f34..2a4a3ad2e3bcaa 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s
 
 ; Test using saddr addressing mode of global_*store_* flat instructions.
 
@@ -21,6 +22,15 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i8_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -45,6 +55,15 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:2047
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -70,6 +89,15 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:-2048
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -118,6 +146,18 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    ds_load_b64 v[2:3], v2
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -159,6 +199,18 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    ds_load_b64 v[2:3], v2
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1] offset:-120
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -183,6 +235,13 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store i16 %data, ptr addrspace(1) %gep0
@@ -201,6 +260,13 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -220,6 +286,13 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_f16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store half %data, ptr addrspace(1) %gep0
@@ -238,6 +311,13 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -257,6 +337,13 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store i32 %data, ptr addrspace(1) %gep0
@@ -275,6 +362,13 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -294,6 +388,13 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_f32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store float %data, ptr addrspace(1) %gep0
@@ -312,6 +413,13 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -331,6 +439,13 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_p3_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store ptr addrspace(3) %data, ptr addrspace(1) %gep0
@@ -349,6 +464,13 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(ptr addrspa
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -368,6 +490,13 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i64_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store i64 %data, ptr addrspace(1) %gep0
@@ -386,6 +515,13 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -405,6 +541,13 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_f64_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store double %data, ptr addrspace(1) %gep0
@@ -423,6 +566,13 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -442,6 +592,13 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <2 x i32> %data, ptr addrspace(1) %gep0
@@ -460,6 +617,13 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -479,6 +643,13 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <2 x float> %data, ptr addrspace(1) %gep0
@@ -497,6 +668,13 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -516,6 +694,13 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <4 x i16> %data, ptr addrspace(1) %gep0
@@ -534,6 +719,13 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -553,6 +745,13 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <4 x half> %data, ptr addrspace(1) %gep0
@@ -571,6 +770,13 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -590,6 +796,13 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_p1_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store ptr addrspace(1) %data, ptr addrspace(1) %gep0
@@ -608,6 +821,13 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(ptr addrspa
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -627,6 +847,13 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <3 x i32> %data, ptr addrspace(1) %gep0
@@ -645,6 +872,13 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -664,6 +898,13 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <3 x float> %data, ptr addrspace(1) %gep0
@@ -682,6 +923,13 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -701,6 +949,13 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <6 x i16> %data, ptr addrspace(1) %gep0
@@ -719,6 +974,13 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -738,6 +1000,13 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <6 x half> %data, ptr addrspace(1) %gep0
@@ -756,6 +1025,13 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -775,6 +1051,13 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4i32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <4 x i32> %data, ptr addrspace(1) %gep0
@@ -793,6 +1076,13 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -812,6 +1102,13 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <4 x float> %data, ptr addrspace(1) %gep0
@@ -830,6 +1127,13 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -849,6 +1153,13 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <2 x i64> %data, ptr addrspace(1) %gep0
@@ -867,6 +1178,13 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -886,6 +1204,13 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <2 x double> %data, ptr addrspace(1) %gep0
@@ -904,6 +1229,13 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -923,6 +1255,13 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <8 x i16> %data, ptr addrspace(1) %gep0
@@ -941,6 +1280,13 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -960,6 +1306,13 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <8 x half> %data, ptr addrspace(1) %gep0
@@ -978,6 +1331,13 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -997,6 +1357,13 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <2 x ptr addrspace(1)> %data, ptr addrspace(1) %gep0
@@ -1015,6 +1382,13 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr addrs
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1034,6 +1408,13 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store <4 x ptr addrspace(3)> %data, ptr addrspace(1) %gep0
@@ -1052,6 +1433,13 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr addrs
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1075,6 +1463,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store atomic i32 %data, ptr addrspace(1) %gep0 seq_cst, align 4
@@ -1093,6 +1488,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1112,6 +1514,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   store atomic i64 %data, ptr addrspace(1) %gep0 seq_cst, align 8
@@ -1130,6 +1539,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1153,6 +1569,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(ptr addrspace(1) i
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_d16_hi_b16 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %data.hi = extractelement <2 x i16> %data, i32 1
@@ -1172,6 +1595,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -1192,6 +1622,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr addrsp
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_d16_hi_b8 v0, v1, s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %data.hi = extractelement <2 x i16> %data, i32 1
@@ -1212,6 +1649,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 31de604647a135..3d11c8bc499e23 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_add_i64_offset:
@@ -41,6 +42,18 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -99,6 +112,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -158,6 +188,23 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -225,6 +272,24 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -275,6 +340,18 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -332,6 +409,23 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -388,6 +482,23 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -452,6 +563,24 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -497,6 +626,18 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -555,6 +696,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -614,6 +772,23 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -681,6 +856,24 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -731,6 +924,18 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -788,6 +993,23 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -844,6 +1066,23 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -908,6 +1147,24 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -953,6 +1210,18 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -1011,6 +1280,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -1070,6 +1356,23 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -1137,6 +1440,24 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -1187,6 +1508,18 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -1244,6 +1577,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -1300,6 +1650,23 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -1364,6 +1731,24 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -1403,6 +1788,17 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1458,6 +1854,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1511,6 +1923,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -1575,6 +2003,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -1619,6 +2064,17 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -1673,6 +2129,22 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -1723,6 +2195,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1784,6 +2272,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_max_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -1823,6 +2328,17 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1878,6 +2394,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -1931,6 +2463,22 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -1995,6 +2543,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -2039,6 +2604,17 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -2093,6 +2669,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -2143,6 +2735,22 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -2204,6 +2812,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -2243,6 +2868,17 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -2298,6 +2934,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -2351,6 +3003,22 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -2415,6 +3083,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -2459,6 +3144,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -2513,6 +3209,22 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -2563,6 +3275,22 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -2624,6 +3352,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_min_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -2663,6 +3408,17 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -2718,6 +3474,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
@@ -2771,6 +3543,22 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -2835,6 +3623,23 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -2879,6 +3684,17 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
@@ -2933,6 +3749,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -2983,6 +3815,22 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    s_addc_u32 s1, s5, s1
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -3044,6 +3892,23 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
@@ -3089,6 +3954,18 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -3147,6 +4024,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -3206,6 +4100,23 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -3273,6 +4184,24 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -3323,6 +4252,18 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -3380,6 +4321,23 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -3436,6 +4394,23 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -3500,6 +4475,24 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -3545,6 +4538,18 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -3589,6 +4594,18 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_f64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in syncscope("agent") seq_cst
@@ -3633,6 +4650,18 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_pointer_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst
@@ -3691,6 +4720,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -3750,6 +4796,23 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -3817,6 +4880,24 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -3867,6 +4948,18 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -3924,6 +5017,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -3980,6 +5090,23 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -4044,6 +5171,24 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -4089,6 +5234,18 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -4147,6 +5304,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -4206,6 +5380,23 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -4273,6 +5464,24 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -4323,6 +5532,18 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
@@ -4380,6 +5601,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
@@ -4436,6 +5674,23 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -4500,6 +5755,24 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
@@ -4558,6 +5831,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -4617,6 +5905,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_soffset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 9000
   %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -4678,6 +5981,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -4742,6 +6061,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -4818,6 +6153,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -4878,6 +6234,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
@@ -4938,6 +6309,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
@@ -4999,6 +6386,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[0:1]
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -5072,6 +6475,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -5125,6 +6549,20 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %in, i64 4
   %val = load atomic i64, ptr addrspace(1) %gep  seq_cst, align 8
@@ -5179,6 +6617,20 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64_neg_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %in, i64 -4
   %val = load atomic i64, ptr addrspace(1) %gep  seq_cst, align 8
@@ -5229,6 +6681,20 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %val = load atomic i64, ptr addrspace(1) %in syncscope("agent") seq_cst, align 8
   store i64 %val, ptr addrspace(1) %out
@@ -5292,6 +6758,25 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -5355,6 +6840,25 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
   %val = load atomic i64, ptr addrspace(1) %ptr seq_cst, align 8
@@ -5419,6 +6923,25 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr addrspace(1) %in, i64 %index
   %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4
@@ -5463,6 +6986,17 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3] offset:32
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   store atomic i64 %in, ptr addrspace(1) %gep  seq_cst, align 8
@@ -5503,6 +7037,17 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   store atomic i64 %in, ptr addrspace(1) %out seq_cst, align 8
   ret void
@@ -5555,6 +7100,22 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -5607,6 +7168,22 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_i64_addr64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   store atomic i64 %in, ptr addrspace(1) %ptr seq_cst, align 8
@@ -5660,6 +7237,22 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f64_addr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4
@@ -5705,6 +7298,18 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -5763,6 +7368,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -5822,6 +7444,23 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_i64_incr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
@@ -5867,6 +7506,18 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -5925,6 +7576,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_ret_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
@@ -5984,6 +7652,23 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_i64_decr64_offset:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
index a046179636cd3a..5c8845bca51416 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX12PLUS
 
 declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1)
 declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
 
 ; GCN-LABEL: {{^}}buffer_atomic_csub_rtn:
-; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc
+; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc
+; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
 define amdgpu_ps void @buffer_atomic_csub_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
 main_body:
   %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
@@ -13,7 +15,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_atomic_csub_no_rtn:
-; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen
+; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen
+; GFX12PLUS: buffer_atomic_csub_u32 v0, v1, s[0:3], null idxen
 define amdgpu_ps void @buffer_atomic_csub_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 {
 main_body:
   %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
@@ -21,7 +24,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_rtn:
-; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc
+; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc
+; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
 define amdgpu_ps void @buffer_atomic_csub_off4_slc_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
 main_body:
   %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
@@ -29,7 +33,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_no_rtn:
-; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc
+; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc
+; GFX12PLUS: buffer_atomic_csub_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT
 define amdgpu_ps void @buffer_atomic_csub_off4_slc_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 {
 main_body:
   %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
@@ -37,7 +42,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_csub_rtn:
-; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
+; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -45,7 +51,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_csub_no_rtn:
-; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
+; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1]
 define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -53,7 +60,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn:
-; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
+; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
@@ -62,7 +70,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn:
-; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
+; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4
 define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1

diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 5921fe1524720c..047cb3ab400084 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -10,6 +10,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @private_nontemporal_load_0(
 ; GFX6-LABEL: private_nontemporal_load_0:
@@ -179,6 +181,34 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ; GFX11-CU-NEXT:    s_nop 0
 ; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_nontemporal_load_0:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT_HT
+; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    s_nop 0
+; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_nontemporal_load_0:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT_HT
+; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    s_nop 0
+; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
   %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
@@ -361,6 +391,36 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ; GFX11-CU-NEXT:    s_nop 0
 ; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_nontemporal_load_1:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT
+; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    s_nop 0
+; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_nontemporal_load_1:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT
+; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    s_nop 0
+; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -532,6 +592,26 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0 glc slc dlc
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_nontemporal_store_0:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_nontemporal_store_0:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
   %val = load i32, ptr addrspace(1) %in, align 4
@@ -708,6 +788,28 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    scratch_store_b32 v0, v1, off glc slc dlc
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_nontemporal_store_1:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_nontemporal_store_1:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 5b4a1059aee046..4b1fb295adec2a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -6,6 +6,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @private_volatile_load_0(
 ; GFX6-LABEL: private_volatile_load_0:
@@ -123,6 +125,34 @@ define amdgpu_kernel void @private_volatile_load_0(
 ; GFX11-CU-NEXT:    s_nop 0
 ; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_volatile_load_0:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT
+; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    s_nop 0
+; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_volatile_load_0:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT
+; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    s_nop 0
+; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
   %val = load volatile i32, ptr addrspace(5) %in, align 4
@@ -251,6 +281,36 @@ define amdgpu_kernel void @private_volatile_load_1(
 ; GFX11-CU-NEXT:    s_nop 0
 ; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_volatile_load_1:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT
+; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    s_nop 0
+; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_volatile_load_1:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT
+; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    s_nop 0
+; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -377,6 +437,28 @@ define amdgpu_kernel void @private_volatile_store_0(
 ; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_volatile_store_0:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT
+; GFX12-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_volatile_store_0:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT
+; GFX12-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
   %val = load i32, ptr addrspace(1) %in, align 4
@@ -506,6 +588,30 @@ define amdgpu_kernel void @private_volatile_store_1(
 ; GFX11-CU-NEXT:    scratch_store_b32 v0, v1, off dlc
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_volatile_store_1:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT
+; GFX12-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_volatile_store_1:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT
+; GFX12-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index dcaf7664d5b5aa..5a8814e7b20fa3 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -2,9 +2,11 @@
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
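 ; For example, in the GFX9 SDAG output below, a 24-bit offset of 0x7fffff
 ; (8388607) is split into a VALU add of the high bits (0x7ff000) plus an
 ; immediate offset:4095, while GFX12's 24-bit signed offset field encodes
 ; the full offset:8388607 directly in the instruction.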
@@ -32,6 +34,13 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 1
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -60,6 +69,13 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 2047
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -88,6 +104,13 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -121,6 +144,13 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -143,6 +173,63 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
   ret i8 %load
 }
 
+define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
+; GFX9-SDAG-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff000, v0
+; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8388607
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fffff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 8388607
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
+
 define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
@@ -170,6 +257,13 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -2048
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -202,6 +296,13 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -234,11 +335,58 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
+define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xff800000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388608
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 -8388608
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
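+; Illustrative arithmetic (comment only): -8388608 = -(2^23), the most
+; negative value a signed 24-bit offset can hold, so the GFX12 checks fold
+; offset:-8388608 directly while GFX9/10/11 add 0xff800000 (the low 32
+; bits of -8388608) and -1 (its sign extension) into the pointer pair.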
+
 define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
@@ -262,6 +410,13 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -295,6 +450,13 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -345,6 +507,13 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:16383
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -367,6 +536,74 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
   ret i8 %load
 }
 
+define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
+; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff000, v0
+; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4094
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4094
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8388606
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffe, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 16777214
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
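+; Illustrative arithmetic (comment only): 16777214 = 2 * 8388607 overflows
+; a signed 24-bit field. The GFX12 SDAG checks split it, biasing the base
+; by 0x800000 and keeping the rest immediate (8388608 + 8388606 =
+; 16777214); the GISEL checks materialize the full 0xfffffe constant.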
+
 define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
@@ -394,6 +631,13 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -426,6 +670,13 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -458,11 +709,68 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -16384
   %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
+define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xff000001, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388607
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 -16777215
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
+
 ; Fill 11-bit low-bits (1ull << 33) | 2047
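+; Illustrative arithmetic (comment only): (1ull << 33) | 2047 =
+; 8589934592 + 2047 = 8589936639, i.e. high dword 2 and low dword 0x7ff.
+; The GFX12 SDAG checks add (0, 2) to the pointer pair and keep 2047 in
+; the immediate; the GISEL checks add the split constant via s0/s1.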
 define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
@@ -492,6 +800,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -531,6 +848,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589936639
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -565,6 +896,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -604,6 +944,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589936640
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -638,6 +992,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -677,6 +1040,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589938687
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -711,6 +1088,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4096
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -750,6 +1136,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589938688
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -784,6 +1184,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -823,6 +1232,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589942783
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -857,6 +1280,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8192
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,6 +1328,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589942784
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -931,6 +1377,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -970,6 +1425,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773761
   %load = load i8, ptr %gep, align 4
   ret i8 %load
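+; Illustrative arithmetic (comment only): -9223372036854773761 is
+; INT64_MIN + 2047 (high dword 0x80000000, low dword 0x7ff). The GFX12
+; SDAG checks bias the base to keep the immediate in range:
+; 0x800000 - 8386561 = 8388608 - 8386561 = 2047.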
@@ -1005,6 +1474,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1044,6 +1522,20 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773760
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -1079,6 +1571,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1118,6 +1619,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771713
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -1153,6 +1668,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1192,6 +1716,20 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771712
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -1227,6 +1765,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1266,6 +1813,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767617
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -1301,6 +1862,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1340,6 +1910,20 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767616
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -1379,6 +1963,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 1
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
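+; Note (comment only): in the GFX12 checks for these volatile-load tests,
+; the load carries th:TH_LOAD_RT_NT, which appears to be the GFX12
+; temporal-hint operand for volatile accesses; the small offsets are still
+; folded into the flat_load_u8 immediate as in the non-volatile tests.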
@@ -1419,6 +2013,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 2047
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -1459,6 +2063,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -1504,6 +2118,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1574,6 +2198,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1644,6 +2278,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1714,6 +2358,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1779,6 +2433,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -1824,6 +2488,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1894,6 +2568,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:16383 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1964,6 +2648,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2034,6 +2728,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2104,6 +2808,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2174,6 +2888,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2199,6 +2925,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589936639
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2244,6 +2983,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2269,6 +3020,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589936640
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2314,6 +3078,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2339,6 +3115,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589938687
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2385,6 +3174,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4096 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2410,6 +3211,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589938688
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2456,6 +3270,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2481,6 +3307,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589942783
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2527,6 +3366,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8192 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2552,6 +3403,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589942784
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2600,6 +3464,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2625,6 +3502,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773761
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2673,6 +3563,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2698,6 +3601,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773760
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2746,6 +3662,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2771,6 +3700,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771713
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2819,6 +3761,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2844,6 +3799,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771712
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2892,6 +3860,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2917,6 +3898,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767617
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef
@@ -2965,6 +3959,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2990,6 +3997,19 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767616
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr undef

diff  --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 137c83a0fd80c9..88be7e54ced107 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -2,9 +2,11 @@
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
 
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
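
[Editorial aside, not part of the patch.] The splitting these tests exercise can be
sketched as follows. This is illustrative C++ only, not the in-tree helper (the
backend's actual split, SIInstrInfo::splitFlatOffset, may pick a different but
equally valid remainder per subtarget and flat variant, as the SDAG vs. GISEL
checks below show):

    #include <cstdint>
    #include <utility>

    // Split a 64-bit byte offset into (Remainder, Imm) such that
    // Offset == Remainder + Imm and Imm fits a signed NumBits-wide
    // immediate field. The remainder is folded into the address add;
    // the immediate is encoded in the load/store itself.
    std::pair<int64_t, int64_t> splitOffset(int64_t Offset, unsigned NumBits) {
      int64_t MinImm = -(int64_t(1) << (NumBits - 1));
      int64_t MaxImm = (int64_t(1) << (NumBits - 1)) - 1;
      if (Offset >= MinImm && Offset <= MaxImm)
        return {0, Offset}; // fits: no extra add needed
      // Keep the sign-extended low NumBits as the immediate.
      int64_t Imm =
          (int64_t)((uint64_t)Offset << (64 - NumBits)) >> (64 - NumBits);
      return {Offset - Imm, Imm}; // Remainder is a multiple of 1 << NumBits
    }

The new GFX12 checks imply a 24-bit signed field (offset:8388607 through
offset:-8388608 encode directly), so with NumBits = 24 the sketch keeps those
in the instruction and splits anything wider. The in-tree lowering sometimes
prefers a biased split instead: the GFX12-SDAG checks for the 64-bit cases
below add 0x800000 to the base and make up the difference with a negative
immediate so the field stays in range.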
@@ -30,6 +32,13 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:1
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -56,6 +65,13 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -85,6 +101,13 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: global_inst_valu_offset_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -126,6 +149,13 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: global_inst_valu_offset_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -157,6 +187,72 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
   ret i8 %load
 }
 
+define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
+; GFX9-GISEL-LABEL: global_inst_valu_offset_24bit_max:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fffff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: global_inst_valu_offset_24bit_max:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_inst_valu_offset_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_24bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8388607
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff000, v0
+; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_inst_valu_offset_24bit_max:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff800, v0
+; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_inst_valu_offset_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8388607
+  %load = load i8, ptr addrspace(1) %gep, align 4
+  ret i8 %load
+}
+
 define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
@@ -178,6 +274,13 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -206,6 +309,13 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -238,11 +348,57 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
+define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
+; GFX9-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xff800000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388608
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608
+  %load = load i8, ptr addrspace(1) %gep, align 4
+  ret i8 %load
+}
+
 define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
@@ -267,6 +423,13 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -308,6 +471,13 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -367,6 +537,13 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:16383
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -398,6 +575,83 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
   ret i8 %load
 }
 
+define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
+; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffe, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff000, v0
+; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:4094
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff800, v0
+; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2046
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4094
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8388606
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214
+  %load = load i8, ptr addrspace(1) %gep, align 4
+  ret i8 %load
+}
+
 define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
@@ -421,6 +675,13 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -453,6 +714,13 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -485,11 +753,96 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-16384
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
+define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
+; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xff000001, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xff001000, v0
+; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000800, v0
+; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2047
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff001000, v0
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388607
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215
+  %load = load i8, ptr addrspace(1) %gep, align 4
+  ret i8 %load
+}
+
 ; Fill 11-bit low-bits (1ull << 33) | 2047
 define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
@@ -532,6 +885,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -558,6 +925,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -605,6 +981,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -631,6 +1021,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -678,6 +1077,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -704,6 +1117,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -751,6 +1173,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -777,6 +1213,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -824,6 +1269,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -850,6 +1309,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -897,6 +1365,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -923,6 +1405,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -970,6 +1461,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -997,6 +1502,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2049
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386561
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1044,6 +1558,20 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1071,6 +1599,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386560
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1118,6 +1655,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1145,6 +1696,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384513
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1192,6 +1752,20 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1219,6 +1793,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384512
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1266,6 +1849,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1293,6 +1890,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380417
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1340,6 +1946,20 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
+; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1367,6 +1987,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380416
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1404,6 +2033,18 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1442,6 +2083,18 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1480,6 +2133,18 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1518,6 +2183,18 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1556,6 +2233,18 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1598,6 +2287,18 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1655,6 +2356,18 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1717,6 +2430,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1755,6 +2480,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1793,6 +2530,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16383 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -1835,6 +2584,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1892,6 +2653,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1963,6 +2736,18 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-16384 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2035,6 +2820,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2059,6 +2859,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2107,6 +2921,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2131,6 +2960,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2179,6 +3022,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2203,6 +3061,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2251,6 +3123,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2275,6 +3162,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2323,6 +3224,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2347,6 +3263,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2395,6 +3325,21 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -2419,6 +3364,20 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192 th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2464,6 +3423,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x7ff
+; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2509,6 +3497,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x800
+; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2554,6 +3571,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0xfff
+; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2599,6 +3645,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x1000
+; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2644,6 +3719,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x1fff
+; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
@@ -2689,6 +3793,35 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x2000
+; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) undef
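
For reference, the boundary all of these offset cases probe is the signed 24-bit immediate offset field that GFX12 VGLOBAL/VFLAT encodings provide (-8388608..8388607), versus the 13-bit field on GFX9/GFX11: a displacement that fits the field folds directly into the instruction's offset: operand, while anything outside it is first materialized with scalar or carry adds, as in the split cases above. A minimal sketch of such a test, with a hypothetical function name not taken from the committed tests, might look like:

define i8 @global_offset_24bit_max_sketch(ptr addrspace(1) %p) {
  ; 8388607 is the largest displacement a GFX12 global_load should still be
  ; able to encode directly in its offset: field; one byte more would force
  ; the pointer add to be emitted as separate add instructions instead.
  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8388607
  %load = load i8, ptr addrspace(1) %gep, align 1
  ret i8 %load
}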