[llvm] [AMDGPU][SILoadStoreOptimizer] Try to find common base for L/Ss with 0 offset (PR #71126)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 2 16:34:00 PDT 2023


https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/71126

By collecting loads/stores with a 0 offset during promoteConstantOffsetToImm, the coherent AddrMode reasoning becomes aware of them, which improves addressing codegen in some cases.

>From 51854051e45e8669c930b83b6478ac9ecf8c59b3 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 2 Nov 2023 11:28:48 -0700
Subject: [PATCH] [AMDGPU][SILoadStoreOptimizer] Try to find common base for
 L/Ss with 0 offset

Change-Id: I2ccb625b2a4d5cfeaad380ec1008ecbad0482f3e
---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  35 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      | 400 +++++++++---------
 .../AMDGPU/promote-constOffset-to-imm.mir     | 123 ++++++
 3 files changed, 345 insertions(+), 213 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 17105965471f65b..36c05c72f427b93 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2023,9 +2023,15 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
   if (Offset0P)
     BaseLo = *Src1;
   else {
-    if (!(Offset0P = extractConstOffset(*Src1)))
-      return;
-    BaseLo = *Src0;
+    if ((Offset0P = extractConstOffset(*Src1)))
+      BaseLo = *Src0;
+  }
+
+  // We are unable to find an offset by looking through BaseLo. Try using 0
+  // offset with BaseLo as the base.
+  if (!Offset0P) {
+    Offset0P = 0;
+    BaseLo = Def->getOperand(1);
   }
 
   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
@@ -2034,17 +2040,24 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
   if (Src0->isImm())
     std::swap(Src0, Src1);
 
-  if (!Src1->isImm())
-    return;
+  std::optional<uint64_t> Offset1;
+  if (Src1->isImm()) {
+    Offset1 = Src1->getImm();
+    BaseHi = *Src0;
+  }
 
-  uint64_t Offset1 = Src1->getImm();
-  BaseHi = *Src0;
+  // We are unable to find an offset by looking through BaseHi. Try using 0
+  // offset with BaseHi as the base.
+  if (!Offset1) {
+    Offset1 = 0;
+    BaseHi = Def->getOperand(3);
+  }
 
   Addr.Base.LoReg = BaseLo.getReg();
   Addr.Base.HiReg = BaseHi.getReg();
   Addr.Base.LoSubReg = BaseLo.getSubReg();
   Addr.Base.HiSubReg = BaseHi.getSubReg();
-  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
+  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (*Offset1 << 32);
 }
 
 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
@@ -2082,12 +2095,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   } else
     MAddr = Visited[&MI];
 
-  if (MAddr.Offset == 0) {
-    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
-                         " constant offsets that can be promoted.\n";);
-    return false;
-  }
-
   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3cb03099da93d51..4bb619209e6a68b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -110,53 +110,53 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v18
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v20
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s1, 0x2000
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off offset:2048
+; GFX9-NEXT:    s_movk_i32 s1, 0x3000
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:-4096
+; GFX9-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[2:3], off offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v7, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: clmem_read_simplified:
@@ -185,46 +185,45 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, 0x800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, 0x1800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, 0x2800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off
-; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x3000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off
+; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[0:1], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, 0x3800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
-; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
@@ -253,48 +252,51 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2048
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x2000
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:-4096
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[10:11], v[6:7], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v0, 0x3000
 ; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0x2000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[14:15], v[0:1], off
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
@@ -1148,10 +1150,10 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:1024
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:2048
-; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:3072
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:1024
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:3072
+; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:-4096
 ; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v10, v[2:3], off offset:1024
 ; GFX9-NEXT:    global_load_dword v11, v[2:3], off offset:2048
@@ -1160,10 +1162,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v2, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:1024
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NEXT:    v_add_u32_e32 v0, v6, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add3_u32 v0, v7, v0, v8
+; GFX9-NEXT:    v_add_u32_e32 v0, v5, v8
+; GFX9-NEXT:    v_add3_u32 v0, v6, v0, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -1201,41 +1202,42 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x800
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x1000
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    global_load_dword v9, v[0:1], off offset:1024
 ; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    s_clause 0x4
-; GFX10-NEXT:    global_load_dword v9, v[0:1], off
-; GFX10-NEXT:    global_load_dword v10, v[0:1], off offset:1024
-; GFX10-NEXT:    global_load_dword v11, v[2:3], off offset:1024
-; GFX10-NEXT:    global_load_dword v12, v[4:5], off offset:-2048
-; GFX10-NEXT:    global_load_dword v13, v[4:5], off
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dword v10, v[2:3], off offset:-2048
+; GFX10-NEXT:    global_load_dword v11, v[2:3], off
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1800, v0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1800
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dword v12, v[4:5], off offset:1024
+; GFX10-NEXT:    global_load_dword v13, v[6:7], off offset:1024
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v14, v[6:7], off offset:1024
-; GFX10-NEXT:    global_load_dword v15, v[2:3], off offset:1024
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:-2048
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dword v2, v[4:5], off offset:-2048
-; GFX10-NEXT:    global_load_dword v3, v[4:5], off
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:1024
+; GFX10-NEXT:    global_load_dword v2, v[4:5], off offset:1024
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    global_load_dword v14, v[0:1], off offset:1024
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v10, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v9, v10
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_add3_u32 v0, v12, v0, v11
+; GFX10-NEXT:    v_add3_u32 v0, v11, v0, v12
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_add3_u32 v0, v13, v0, v14
+; GFX10-NEXT:    v_add3_u32 v0, v6, v0, v13
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_add3_u32 v0, v2, v0, v15
+; GFX10-NEXT:    v_add3_u32 v0, v7, v0, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add3_u32 v0, v3, v0, v6
+; GFX10-NEXT:    v_add3_u32 v0, v3, v0, v14
 ; GFX10-NEXT:    global_store_dword v8, v0, s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1261,37 +1263,38 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v7, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v8, v[0:1], off offset:1024
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b32 v7, v[0:1], off offset:1024
+; GFX11-NEXT:    global_load_b32 v8, v[2:3], off offset:-4096
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    global_load_b32 v9, v[0:1], off offset:2048
 ; GFX11-NEXT:    global_load_b32 v10, v[0:1], off offset:3072
-; GFX11-NEXT:    global_load_b32 v11, v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b32 v12, v[2:3], off offset:1024
-; GFX11-NEXT:    global_load_b32 v13, v[2:3], off offset:2048
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-NEXT:    global_load_b32 v3, v[4:5], off offset:1024
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v3, v[4:5], off
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_load_b32 v11, v[4:5], off offset:2048
+; GFX11-NEXT:    global_load_b32 v4, v[4:5], off offset:3072
+; GFX11-NEXT:    global_load_b32 v5, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:1024
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, v8, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, v7, v8
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v9, v1, v10
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add3_u32 v1, v11, v1, v12
+; GFX11-NEXT:    v_add3_u32 v1, v2, v1, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v13, v1, v2
+; GFX11-NEXT:    v_add3_u32 v1, v11, v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add3_u32 v0, v3, v1, v0
+; GFX11-NEXT:    v_add3_u32 v0, v5, v1, v0
 ; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2158,52 +2161,51 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v22
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v24
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_movk_i32 s0, 0x1000
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:2048
 ; GFX9-NEXT:    s_movk_i32 s0, 0x2000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:2048
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
+; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[10:11], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[2:3], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[22:23], v[0:1], off offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v22, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v23, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v24, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ReverseOrder:
@@ -2228,58 +2230,57 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 3
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
+; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v22
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, 0x800
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1800, v0
-; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v14
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off
-; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off
+; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[14:15], off
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[12:13], off
 ; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v20, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v21, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
+; GFX10-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: ReverseOrder:
@@ -2305,44 +2306,45 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3000, v0
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[10:11], v[4:5], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[12:13], v[6:7], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
+; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off offset:2048
+; GFX11-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:2048
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x4
-; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off
-; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
-; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[14:15], v[14:15], off offset:2048
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v10, v8
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v11, v9, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v8
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v9, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v12, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v13, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v14, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v15, v5, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
index 91b312dfdf95d5d..c48a61ec88191a5 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
@@ -212,3 +212,126 @@ body:             |
     %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
     GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, implicit $exec
 ...
+---
+# Three loads from %91/%92: +1024, +4096, and +0 (a plain REG_SEQUENCE with no add).
+# GFX9-LABEL: name: offset0_rebased
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -3072, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -4096, 0
+# Expected: all three loads share one address register, with immediates -3072, 0, -4096.
+name: offset0_rebased
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
+    %3:sgpr_32 = COPY $sgpr6
+    %2:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2:sgpr_64(p4), 0, 0
+    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2:sgpr_64(p4), 8, 0
+    %8:vgpr_32 = V_LSHL_ADD_U32_e64 %3:sgpr_32, 8, %0:vgpr_32(s32), implicit $exec
+    %90:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %89:vreg_64_align2 = REG_SEQUENCE %8:vgpr_32, %subreg.sub0, killed %90:vgpr_32, %subreg.sub1
+    %13:vreg_64_align2 = V_LSHLREV_B64_e64 4, %89:vreg_64_align2, implicit $exec
+    %91:vgpr_32, %93:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub0:sreg_64_xexec, %13.sub0:vreg_64_align2, 0, implicit $exec
+    %99:vgpr_32 = COPY %5.sub1:sreg_64_xexec
+    %92:vgpr_32, dead %94:sreg_64_xexec = V_ADDC_U32_e64 %99:vgpr_32, %13.sub1:vreg_64_align2, killed %93:sreg_64_xexec, 0, implicit $exec
+    %22:vgpr_32 = V_LSHLREV_B32_e64 2, %0:vgpr_32(s32), implicit $exec
+    %27:sreg_32 = S_MOV_B32 1024
+    %28:vgpr_32, %29:sreg_64_xexec = V_ADD_CO_U32_e64 %27:sreg_32, %91:vgpr_32, 0, implicit $exec
+    %32:vgpr_32, %33:sreg_64 = V_ADDC_U32_e64 0, %92:vgpr_32, killed %29:sreg_64_xexec, 0, implicit $exec
+    %35:vreg_64_align2 = REG_SEQUENCE killed %28:vgpr_32, %subreg.sub0, killed %32:vgpr_32, %subreg.sub1
+    %36:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %35:vreg_64_align2, 0, 0, implicit $exec
+    %50:sreg_32 = S_MOV_B32  4096
+    %51:vgpr_32, %52:sreg_64_xexec = V_ADD_CO_U32_e64 %50:sreg_32, %91:vgpr_32, 0, implicit $exec
+    %53:vgpr_32, %54:sreg_64 = V_ADDC_U32_e64 0, %92:vgpr_32, killed %52:sreg_64_xexec, 0, implicit $exec
+    %56:vreg_64_align2 = REG_SEQUENCE killed %51:vgpr_32, %subreg.sub0, killed %53:vgpr_32, %subreg.sub1
+    %57:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %56:vreg_64_align2, 0, 0, implicit $exec
+    %15:vreg_64_align2 = REG_SEQUENCE %91:vgpr_32, %subreg.sub0, %92:vgpr_32, %subreg.sub1
+    %16:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %15:vreg_64_align2, 0, 0, implicit $exec
+...
+---
+# Three loads from %91/%92: +2048, +3072, and +0 (a plain REG_SEQUENCE with no add).
+# GFX9-LABEL: name: offset0_base
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 2048, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 3072, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
+# Expected: one shared address register (presumably the +0 address), immediates 2048, 3072, 0.
+name: offset0_base
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
+    %3:sgpr_32 = COPY $sgpr6
+    %2:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2:sgpr_64(p4), 0, 0
+    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2:sgpr_64(p4), 8, 0
+    %8:vgpr_32 = V_LSHL_ADD_U32_e64 %3:sgpr_32, 8, %0:vgpr_32(s32), implicit $exec
+    %90:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %89:vreg_64_align2 = REG_SEQUENCE %8:vgpr_32, %subreg.sub0, killed %90:vgpr_32, %subreg.sub1
+    %13:vreg_64_align2 = V_LSHLREV_B64_e64 4, %89:vreg_64_align2, implicit $exec
+    %91:vgpr_32, %93:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub0:sreg_64_xexec, %13.sub0:vreg_64_align2, 0, implicit $exec
+    %99:vgpr_32 = COPY %5.sub1:sreg_64_xexec
+    %92:vgpr_32, dead %94:sreg_64_xexec = V_ADDC_U32_e64 %99:vgpr_32, %13.sub1:vreg_64_align2, killed %93:sreg_64_xexec, 0, implicit $exec
+    %22:vgpr_32 = V_LSHLREV_B32_e64 2, %0:vgpr_32(s32), implicit $exec
+    %27:sreg_32 = S_MOV_B32 2048
+    %28:vgpr_32, %29:sreg_64_xexec = V_ADD_CO_U32_e64 %27:sreg_32, %91:vgpr_32, 0, implicit $exec
+    %32:vgpr_32, %33:sreg_64 = V_ADDC_U32_e64 0, %92:vgpr_32, killed %29:sreg_64_xexec, 0, implicit $exec
+    %35:vreg_64_align2 = REG_SEQUENCE killed %28:vgpr_32, %subreg.sub0, killed %32:vgpr_32, %subreg.sub1
+    %36:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %35:vreg_64_align2, 0, 0, implicit $exec
+    %50:sreg_32 = S_MOV_B32  3072
+    %51:vgpr_32, %52:sreg_64_xexec = V_ADD_CO_U32_e64 %50:sreg_32, %91:vgpr_32, 0, implicit $exec
+    %53:vgpr_32, %54:sreg_64 = V_ADDC_U32_e64 0, %92:vgpr_32, killed %52:sreg_64_xexec, 0, implicit $exec
+    %56:vreg_64_align2 = REG_SEQUENCE killed %51:vgpr_32, %subreg.sub0, killed %53:vgpr_32, %subreg.sub1
+    %57:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %56:vreg_64_align2, 0, 0, implicit $exec
+    %15:vreg_64_align2 = REG_SEQUENCE %91:vgpr_32, %subreg.sub0, %92:vgpr_32, %subreg.sub1
+    %16:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %15:vreg_64_align2, 0, 0, implicit $exec
+...
+
+---
+# Four loads from %91/%92: +0 (plain REG_SEQUENCE, no add), +4096, +8192, and +12288.
+# GFX9-LABEL: name: two_bases
+# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], -4096, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0
+# GFX9: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE %{{[0-9]+}}, %subreg.sub0, %{{[0-9]+}}, %subreg.sub1
+# GFX9: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], -4096, 0
+# GFX9: [[GLOBAL_LOAD_DWORDX4_3:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE5]], 0, 0
+# Expected: two address registers, each feeding a pair of loads with immediates -4096 and 0.
+name: two_bases
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
+    %3:sgpr_32 = COPY $sgpr6
+    %2:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2:sgpr_64(p4), 0, 0
+    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2:sgpr_64(p4), 8, 0
+    %8:vgpr_32 = V_LSHL_ADD_U32_e64 %3:sgpr_32, 8, %0:vgpr_32(s32), implicit $exec
+    %90:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %89:vreg_64_align2 = REG_SEQUENCE %8:vgpr_32, %subreg.sub0, killed %90:vgpr_32, %subreg.sub1
+    %13:vreg_64_align2 = V_LSHLREV_B64_e64 4, %89:vreg_64_align2, implicit $exec
+    %91:vgpr_32, %93:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub0:sreg_64_xexec, %13.sub0:vreg_64_align2, 0, implicit $exec
+    %99:vgpr_32 = COPY %5.sub1:sreg_64_xexec
+    %92:vgpr_32, dead %94:sreg_64_xexec = V_ADDC_U32_e64 %99:vgpr_32, %13.sub1:vreg_64_align2, killed %93:sreg_64_xexec, 0, implicit $exec
+    %15:vreg_64_align2 = REG_SEQUENCE %91:vgpr_32, %subreg.sub0, %92:vgpr_32, %subreg.sub1
+    %16:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %15:vreg_64_align2, 0, 0, implicit $exec
+    %22:vgpr_32 = V_LSHLREV_B32_e64 2, %0:vgpr_32(s32), implicit $exec
+    %27:sreg_32 = S_MOV_B32 4096
+    %28:vgpr_32, %29:sreg_64_xexec = V_ADD_CO_U32_e64 %27:sreg_32, %91:vgpr_32, 0, implicit $exec
+    %32:vgpr_32, %33:sreg_64 = V_ADDC_U32_e64 0, %92:vgpr_32, killed %29:sreg_64_xexec, 0, implicit $exec
+    %35:vreg_64_align2 = REG_SEQUENCE killed %28:vgpr_32, %subreg.sub0, killed %32:vgpr_32, %subreg.sub1
+    %36:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %35:vreg_64_align2, 0, 0, implicit $exec
+    %50:sreg_32 = S_MOV_B32 8192
+    %51:vgpr_32, %52:sreg_64_xexec = V_ADD_CO_U32_e64 %50:sreg_32, %91:vgpr_32, 0, implicit $exec
+    %53:vgpr_32, %54:sreg_64 = V_ADDC_U32_e64 0, %92:vgpr_32, killed %52:sreg_64_xexec, 0, implicit $exec
+    %56:vreg_64_align2 = REG_SEQUENCE killed %51:vgpr_32, %subreg.sub0, killed %53:vgpr_32, %subreg.sub1
+    %57:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %56:vreg_64_align2, 0, 0, implicit $exec
+    %60:sreg_32 = S_MOV_B32 12288
+    %61:vgpr_32, %62:sreg_64_xexec = V_ADD_CO_U32_e64 %60:sreg_32, %91:vgpr_32, 0, implicit $exec
+    %63:vgpr_32, %64:sreg_64 = V_ADDC_U32_e64 0, %92:vgpr_32, killed %62:sreg_64_xexec, 0, implicit $exec
+    %66:vreg_64_align2 = REG_SEQUENCE killed %61:vgpr_32, %subreg.sub0, killed %63:vgpr_32, %subreg.sub1
+    %67:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 killed %66:vreg_64_align2, 0, 0, implicit $exec
+
+...



More information about the llvm-commits mailing list