[llvm] [AMDGPU][SILoadStoreOptimizer] Try to find common base for L/Ss with 0 offset (PR #71126)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 3 15:48:02 PDT 2023


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/71126

>From 9606ab21d52a78c5e7bbaaf8787d90907bf54457 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 3 Nov 2023 12:49:58 -0700
Subject: [PATCH 1/2] [AMDGPU] Rebase on top of commit showing effect of
 pull/71126

Change-Id: I22da752020837fd21eae8ada20bf0995d9e6b120
---
 .../AMDGPU/promote-constOffset-to-imm.mir     | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
index 91b312dfdf95d5d..50656785363d91e 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
@@ -212,3 +212,104 @@ body:             |
     %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
     GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, implicit $exec
 ...
+---
+
+# GFX9-LABEL: name: offset0_rebased
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -3072, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
+# GFX9: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
+
+name: offset0_rebased
+body:             |
+  bb.0.entry:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1.sub0, 0, implicit $exec
+    %4:vgpr_32 = COPY %0.sub1
+    %5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 %4, %1.sub1, killed %3, 0, implicit $exec
+    %7:sreg_32 = S_MOV_B32 1024
+    %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %7, %2, 0, implicit $exec
+    %10:vgpr_32, %11:sreg_64 = V_ADDC_U32_e64 0, %5, killed %9, 0, implicit $exec
+    %12:vreg_64_align2 = REG_SEQUENCE killed %8, %subreg.sub0, killed %10, %subreg.sub1
+    %13:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %12, 0, 0, implicit $exec
+    %14:sreg_32 = S_MOV_B32 4096
+    %15:vgpr_32, %16:sreg_64_xexec = V_ADD_CO_U32_e64 %14, %2, 0, implicit $exec
+    %17:vgpr_32, %18:sreg_64 = V_ADDC_U32_e64 0, %5, killed %16, 0, implicit $exec
+    %19:vreg_64_align2 = REG_SEQUENCE killed %15, %subreg.sub0, killed %17, %subreg.sub1
+    %20:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %19, 0, 0, implicit $exec
+    %21:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %5, %subreg.sub1
+    %22:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %21, 0, 0, implicit $exec
+...
+---
+
+# GFX9-LABEL: name: offset0_base
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -1024, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
+# GFX9: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
+
+name: offset0_base
+body:             |
+  bb.0.entry:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1.sub0, 0, implicit $exec
+    %4:vgpr_32 = COPY %0.sub1
+    %5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 %4, %1.sub1, killed %3, 0, implicit $exec
+    %7:sreg_32 = S_MOV_B32 2048
+    %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %7, %2, 0, implicit $exec
+    %10:vgpr_32, %11:sreg_64 = V_ADDC_U32_e64 0, %5, killed %9, 0, implicit $exec
+    %12:vreg_64_align2 = REG_SEQUENCE killed %8, %subreg.sub0, killed %10, %subreg.sub1
+    %13:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %12, 0, 0, implicit $exec
+    %14:sreg_32 = S_MOV_B32 3072
+    %15:vgpr_32, %16:sreg_64_xexec = V_ADD_CO_U32_e64 %14, %2, 0, implicit $exec
+    %17:vgpr_32, %18:sreg_64 = V_ADDC_U32_e64 0, %5, killed %16, 0, implicit $exec
+    %19:vreg_64_align2 = REG_SEQUENCE killed %15, %subreg.sub0, killed %17, %subreg.sub1
+    %20:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %19, 0, 0, implicit $exec
+    %21:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %5, %subreg.sub1
+    %22:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %21, 0, 0, implicit $exec
+...
+
+---
+
+# GFX9-LABEL: name: two_bases
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
+# GFX9: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], -4096, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
+# GFX9: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed %{{[0-9]+}}, %subreg.sub0, killed %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_3:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed [[REG_SEQUENCE7]], 0, 0
+
+
+name: two_bases
+body:             |
+  bb.0.entry:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1.sub0, 0, implicit $exec
+    %4:vgpr_32 = COPY %0.sub1
+    %5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 %4, %1.sub1, killed %3, 0, implicit $exec
+    %7:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %5, %subreg.sub1
+    %8:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %7, 0, 0, implicit $exec
+    %9:vgpr_32 = V_LSHLREV_B32_e64 2, %10:vgpr_32(s32), implicit $exec
+    %11:sreg_32 = S_MOV_B32 4096
+    %12:vgpr_32, %13:sreg_64_xexec = V_ADD_CO_U32_e64 %11, %2, 0, implicit $exec
+    %14:vgpr_32, %15:sreg_64 = V_ADDC_U32_e64 0, %5, killed %13, 0, implicit $exec
+    %16:vreg_64_align2 = REG_SEQUENCE killed %12, %subreg.sub0, killed %14, %subreg.sub1
+    %17:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %16, 0, 0, implicit $exec
+    %18:sreg_32 = S_MOV_B32 8192
+    %19:vgpr_32, %20:sreg_64_xexec = V_ADD_CO_U32_e64 %18, %2, 0, implicit $exec
+    %21:vgpr_32, %22:sreg_64 = V_ADDC_U32_e64 0, %5, killed %20, 0, implicit $exec
+    %23:vreg_64_align2 = REG_SEQUENCE killed %19, %subreg.sub0, killed %21, %subreg.sub1
+    %24:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %23, 0, 0, implicit $exec
+    %25:sreg_32 = S_MOV_B32 12288
+    %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %25, %2, 0, implicit $exec
+    %28:vgpr_32, %29:sreg_64 = V_ADDC_U32_e64 0, %5, killed %27, 0, implicit $exec
+    %30:vreg_64_align2 = REG_SEQUENCE killed %26, %subreg.sub0, killed %28, %subreg.sub1
+    %31:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %30, 0, 0, implicit $exec
+
+...

>From 3b75c87fc10fc0d05bb59dbf451006a87200c417 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 2 Nov 2023 11:28:48 -0700
Subject: [PATCH 2/2] [AMDGPU][SILoadStoreOptimizer] Try to find common base
 for L/Ss with 0 offset

Change-Id: I68c3c4a049680254fd1340de7428c37b9d4e56c8
---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  37 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      | 465 +++++++++---------
 .../AMDGPU/promote-constOffset-to-imm.mir     |  21 +-
 3 files changed, 262 insertions(+), 261 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 17105965471f65b..765ef52d5573be2 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -263,7 +263,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                            int32_t NewOffset) const;
   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
-  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
+  int32_t extractConstOffset(const MachineOperand &Op) const;
   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
   /// Promotes constant offset to the immediate by adjusting the base. It
   /// tries to use a base from the nearby instructions that allows it to have
@@ -1968,18 +1968,18 @@ void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
 }
 
-std::optional<int32_t>
+int32_t
 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
   if (Op.isImm())
     return Op.getImm();
 
   if (!Op.isReg())
-    return std::nullopt;
+    return 0;
 
   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
       !Def->getOperand(1).isImm())
-    return std::nullopt;
+    return 0;
 
   return Def->getOperand(1).getImm();
 }
@@ -2019,13 +2019,15 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
 
+  // If we are unable to find an offset by looking through BaseLo, then default
+  // to 0 offset with BaseLo as the base.
+  BaseLo = Def->getOperand(1);
   auto Offset0P = extractConstOffset(*Src0);
   if (Offset0P)
     BaseLo = *Src1;
   else {
-    if (!(Offset0P = extractConstOffset(*Src1)))
-      return;
-    BaseLo = *Src0;
+    if ((Offset0P = extractConstOffset(*Src1)))
+      BaseLo = *Src0;
   }
 
   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
@@ -2034,17 +2036,20 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
   if (Src0->isImm())
     std::swap(Src0, Src1);
 
-  if (!Src1->isImm())
-    return;
-
-  uint64_t Offset1 = Src1->getImm();
-  BaseHi = *Src0;
+  // If we are unable to find an offset by looking through BaseHi, then default
+  // to 0 offset with BaseHi as the base.
+  int64_t Offset1 = 0;
+  BaseHi = Def->getOperand(3);
+  if (Src1->isImm()) {
+    Offset1 = Src1->getImm();
+    BaseHi = *Src0;
+  }
 
   Addr.Base.LoReg = BaseLo.getReg();
   Addr.Base.HiReg = BaseHi.getReg();
   Addr.Base.LoSubReg = BaseLo.getSubReg();
   Addr.Base.HiSubReg = BaseHi.getSubReg();
-  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
+  Addr.Offset = (Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
 }
 
 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
@@ -2082,12 +2087,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   } else
     MAddr = Visited[&MI];
 
-  if (MAddr.Offset == 0) {
-    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
-                         " constant offsets that can be promoted.\n";);
-    return false;
-  }
-
   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3cb03099da93d51..f74faa667cfd381 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -110,53 +110,53 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v18
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v20
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s1, 0x2000
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off offset:2048
+; GFX9-NEXT:    s_movk_i32 s1, 0x3000
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:-4096
+; GFX9-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[2:3], off offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v7, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: clmem_read_simplified:
@@ -185,46 +185,45 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, 0x800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, 0x1800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, 0x2800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off
-; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x3000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off
+; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[0:1], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, 0x3800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
-; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
@@ -253,48 +252,51 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2048
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x2000
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:-4096
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[10:11], v[6:7], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v0, 0x3000
 ; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0x2000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[14:15], v[0:1], off
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
@@ -1148,10 +1150,10 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:1024
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:2048
-; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:3072
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:1024
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:3072
+; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:-4096
 ; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v10, v[2:3], off offset:1024
 ; GFX9-NEXT:    global_load_dword v11, v[2:3], off offset:2048
@@ -1160,10 +1162,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v2, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:1024
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NEXT:    v_add_u32_e32 v0, v6, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add3_u32 v0, v7, v0, v8
+; GFX9-NEXT:    v_add_u32_e32 v0, v5, v8
+; GFX9-NEXT:    v_add3_u32 v0, v6, v0, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -1201,41 +1202,42 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x800
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x1000
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    global_load_dword v9, v[0:1], off offset:1024
 ; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    s_clause 0x4
-; GFX10-NEXT:    global_load_dword v9, v[0:1], off
-; GFX10-NEXT:    global_load_dword v10, v[0:1], off offset:1024
-; GFX10-NEXT:    global_load_dword v11, v[2:3], off offset:1024
-; GFX10-NEXT:    global_load_dword v12, v[4:5], off offset:-2048
-; GFX10-NEXT:    global_load_dword v13, v[4:5], off
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dword v10, v[2:3], off offset:-2048
+; GFX10-NEXT:    global_load_dword v11, v[2:3], off
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1800, v0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1800
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dword v12, v[4:5], off offset:1024
+; GFX10-NEXT:    global_load_dword v13, v[6:7], off offset:1024
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v14, v[6:7], off offset:1024
-; GFX10-NEXT:    global_load_dword v15, v[2:3], off offset:1024
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:-2048
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dword v2, v[4:5], off offset:-2048
-; GFX10-NEXT:    global_load_dword v3, v[4:5], off
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:1024
+; GFX10-NEXT:    global_load_dword v2, v[4:5], off offset:1024
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    global_load_dword v14, v[0:1], off offset:1024
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v10, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v9, v10
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_add3_u32 v0, v12, v0, v11
+; GFX10-NEXT:    v_add3_u32 v0, v11, v0, v12
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_add3_u32 v0, v13, v0, v14
+; GFX10-NEXT:    v_add3_u32 v0, v6, v0, v13
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_add3_u32 v0, v2, v0, v15
+; GFX10-NEXT:    v_add3_u32 v0, v7, v0, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add3_u32 v0, v3, v0, v6
+; GFX10-NEXT:    v_add3_u32 v0, v3, v0, v14
 ; GFX10-NEXT:    global_store_dword v8, v0, s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1261,37 +1263,38 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v7, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v8, v[0:1], off offset:1024
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b32 v7, v[0:1], off offset:1024
+; GFX11-NEXT:    global_load_b32 v8, v[2:3], off offset:-4096
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    global_load_b32 v9, v[0:1], off offset:2048
 ; GFX11-NEXT:    global_load_b32 v10, v[0:1], off offset:3072
-; GFX11-NEXT:    global_load_b32 v11, v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b32 v12, v[2:3], off offset:1024
-; GFX11-NEXT:    global_load_b32 v13, v[2:3], off offset:2048
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-NEXT:    global_load_b32 v3, v[4:5], off offset:1024
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v3, v[4:5], off
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_load_b32 v11, v[4:5], off offset:2048
+; GFX11-NEXT:    global_load_b32 v4, v[4:5], off offset:3072
+; GFX11-NEXT:    global_load_b32 v5, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:1024
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, v8, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, v7, v8
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v9, v1, v10
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add3_u32 v1, v11, v1, v12
+; GFX11-NEXT:    v_add3_u32 v1, v2, v1, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v13, v1, v2
+; GFX11-NEXT:    v_add3_u32 v1, v11, v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add3_u32 v0, v3, v1, v0
+; GFX11-NEXT:    v_add3_u32 v0, v5, v1, v0
 ; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1421,32 +1424,33 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
 ; GFX9-NEXT:    s_movk_i32 s0, 0xf000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: Offset64:
@@ -1522,25 +1526,27 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[6:7], v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2158,52 +2164,51 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v22
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v24
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_movk_i32 s0, 0x1000
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:2048
 ; GFX9-NEXT:    s_movk_i32 s0, 0x2000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:2048
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
+; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[10:11], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[2:3], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[22:23], v[0:1], off offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v22, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v23, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v24, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ReverseOrder:
@@ -2228,58 +2233,57 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 3
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
+; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v22
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, 0x800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1800, v0
-; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v14
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off
-; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off
+; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[14:15], off
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[12:13], off
 ; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v20, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
+; GFX10-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: ReverseOrder:
@@ -2305,44 +2309,45 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3000, v0
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[10:11], v[4:5], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[12:13], v[6:7], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
+; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off offset:2048
+; GFX11-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:2048
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x4
-; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off
-; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
-; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[14:15], v[14:15], off offset:2048
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v10, v8
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v11, v9, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v8
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v9, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v12, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v13, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v14, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v15, v5, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
index 50656785363d91e..0aaa9ce1547e7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
@@ -218,8 +218,7 @@ body:             |
 # GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
 # GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -3072, 0
 # GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
-# GFX9: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
-# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -4096, 0
 
 name: offset0_rebased
 body:             |
@@ -246,10 +245,9 @@ body:             |
 
 # GFX9-LABEL: name: offset0_base
 # GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
-# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -1024, 0
-# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
-# GFX9: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
-# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 2048, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 3072, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
 
 name: offset0_base
 body:             |
@@ -276,13 +274,12 @@ body:             |
 ---
 
 # GFX9-LABEL: name: two_bases
-# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
-# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -4096, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
 # GFX9: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
-# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], -4096, 0
-# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
-# GFX9: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed %{{[0-9]+}}, %subreg.sub0, killed %{{[0-9]+}}, %subreg.sub1
-# GFX9: [[GLOBAL_LOAD_DWORDX4_3:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed [[REG_SEQUENCE7]], 0, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], -4096, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_3:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
 
 
 name: two_bases



More information about the llvm-commits mailing list