[llvm] [AMDGPU] Infer Alignments after Inferring AddressSpaces (PR #69776)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 20 13:38:55 PDT 2023


https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/69776

Since ValueTracking / getOrEnforceAlignment gives up on AddrSpaceCasts, we may have suboptimal alignment in kernels that use them. InferAddressSpaces may clean these up, so running InferAlignments after may result in more optimal alignment. This enables certain LoadStore vectorizations.

A WIP while I investigate / work on the test changes, some of which seem a bit odd. For now, wondering if there are objections to the approach.


>From 80d47b1aa84cf3a0586b19aad9010d92e8c5dc8b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 19 Oct 2023 12:41:55 -0700
Subject: [PATCH 1/2] [InferAlignment] Support Legacy PM

Change-Id: Ic0ecf3a2a7e4e7b30877ed7b366d0d054a3d87cc
---
 llvm/include/llvm/InitializePasses.h          |  1 +
 .../llvm/Transforms/Scalar/InferAlignment.h   |  4 ++
 llvm/lib/Transforms/Scalar/InferAlignment.cpp | 40 +++++++++++++++++++
 3 files changed, 45 insertions(+)

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index db653fff71ba95a..e0ce0f4a7cc78df 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -141,6 +141,7 @@ void initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry&);
 void initializeImplicitNullChecksPass(PassRegistry&);
 void initializeIndirectBrExpandPassPass(PassRegistry&);
 void initializeInferAddressSpacesPass(PassRegistry&);
+void initializeInferAligmentsPass(PassRegistry &);
 void initializeInstCountLegacyPassPass(PassRegistry &);
 void initializeInstSimplifyLegacyPassPass(PassRegistry &);
 void initializeInstructionCombiningPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/Scalar/InferAlignment.h b/llvm/include/llvm/Transforms/Scalar/InferAlignment.h
index f6fc5f3b8d9872f..85de257045c5a37 100644
--- a/llvm/include/llvm/Transforms/Scalar/InferAlignment.h
+++ b/llvm/include/llvm/Transforms/Scalar/InferAlignment.h
@@ -18,10 +18,14 @@
 
 namespace llvm {
 
+class FunctionPass;
+
 struct InferAlignmentPass : public PassInfoMixin<InferAlignmentPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+FunctionPass *createInferAlignmentsPass();
+
 } // namespace llvm
 
 #endif // LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index b75b8d486fbbe8b..c45eef4b69f96f9 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -22,6 +22,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "infer-alignment"
+
 static bool tryToImproveAlign(
     const DataLayout &DL, Instruction *I,
     function_ref<Align(Value *PtrOp, Align OldAlign, Align PrefAlign)> Fn) {
@@ -89,3 +91,41 @@ PreservedAnalyses InferAlignmentPass::run(Function &F,
   // Changes to alignment shouldn't invalidated analyses.
   return PreservedAnalyses::all();
 }
+
+class InferAligments : public FunctionPass {
+
+public:
+  static char ID;
+
+  InferAligments() : FunctionPass(ID) {
+    initializeInferAligmentsPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+char InferAligments::ID = 0;
+
+INITIALIZE_PASS_BEGIN(InferAligments, DEBUG_TYPE, "Infer Alignments", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(InferAligments, DEBUG_TYPE, "Infer Alignments", false,
+                    false)
+
+bool InferAligments::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  return inferAlignment(
+      F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), *DT);
+}
+
+FunctionPass *llvm::createInferAlignmentsPass() { return new InferAligments(); }
\ No newline at end of file

>From 5ecb285d6c3138dca544ed1e9813b03afdb2dbb2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 20 Oct 2023 12:43:24 -0700
Subject: [PATCH 2/2] [AMDGPU] Infer Alignments after Inferring AddressSpaces

Change-Id: I6f49893f15a9657107ede1bdf19fb156ca80db7e
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   7 +-
 .../AMDGPU/GlobalISel/function-returns.ll     |   2 +-
 ...licit-kernarg-backend-usage-global-isel.ll |  98 ++--
 .../GlobalISel/irtranslator-amdgpu_kernel.ll  | 123 +++--
 .../GlobalISel/irtranslator-call-sret.ll      |   4 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |  10 +-
 .../GlobalISel/irtranslator-sibling-call.ll   |   8 +-
 .../abi-attribute-hints-undefined-behavior.ll |  37 +-
 .../branch-folding-implicit-def-subreg.ll     |  68 +--
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |  20 +-
 .../AMDGPU/gfx-callable-argument-types.ll     | 473 ++++++++++--------
 .../AMDGPU/implicit-kernarg-backend-usage.ll  | 101 ++--
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 102 +---
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |   6 +-
 14 files changed, 518 insertions(+), 541 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dc7321cd5de9fcd..0cd9de57903bc4f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -58,6 +58,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
+#include "llvm/Transforms/Scalar/InferAlignment.h"
 #include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
@@ -1022,8 +1023,12 @@ void AMDGPUPassConfig::addIRPasses() {
   if (TM.getOptLevel() > CodeGenOptLevel::None)
     addPass(createAMDGPUAttributorPass());
 
-  if (TM.getOptLevel() > CodeGenOptLevel::None)
+  // Alignment Inference may be blocked by addrpacecasts on pointer operands,
+  // so rerun after AddressSpaces have been inferred.
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
     addPass(createInferAddressSpacesPass());
+    addPass(createInferAlignmentsPass());
+  }
 
   // Run atomic optimizer before Atomic Expand
   if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index e6c835fa25406ad..94f6b1664407e17 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -953,7 +953,7 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = nuw G_PTR_ADD [[COPY]], [[C]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.arg0, addrspace 5)
+  ; CHECK-NEXT:   G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.arg0, align 4, addrspace 5)
   ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.gep1, addrspace 5)
   ; CHECK-NEXT:   SI_RETURN
   %val0 = load volatile i8, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index c25ecafa1f7c074..d9ccd1116d21621 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -386,24 +386,14 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V3:       ; %bb.0:
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V3-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8V3-NEXT:    s_add_u32 s0, s8, 8
-; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V3-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8V3-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8V3-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8V3-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX8V3-NEXT:    v_mov_b32_e32 v1, s11
-; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V3-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8V3-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8V3-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -412,24 +402,14 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
-; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V4-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V4-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s11
-; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8V4-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -438,22 +418,14 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V5-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V5-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V5-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8V5-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -462,50 +434,54 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V3:       ; %bb.0:
-; GFX9V3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V3-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9V3-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX9V3-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9V3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9V3-NEXT:    ; kill: killed $sgpr6_sgpr7
+; GFX9V3-NEXT:    ; kill: killed $sgpr8_sgpr9
 ; GFX9V3-NEXT:    ; kill: killed $sgpr4_sgpr5
-; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V3-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V3-NEXT:    s_endpgm
 ;
 ; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9V4-NEXT:    global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V4-NEXT:    global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V4-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V4-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V4-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V4-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9V4-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9V4-NEXT:    ; kill: killed $sgpr6_sgpr7
+; GFX9V4-NEXT:    ; kill: killed $sgpr8_sgpr9
 ; GFX9V4-NEXT:    ; kill: killed $sgpr4_sgpr5
-; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V4-NEXT:    s_endpgm
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9V5-NEXT:    global_load_ubyte v0, v[0:1], off glc
+; GFX9V5-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[6:7] offset:8 glc
-; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
-; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V5-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V5-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9V5-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT:    ; kill: killed $sgpr6_sgpr7
 ; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
-; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
index 6b2e6d8dfdb394c..637fac1cba3730a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -17,6 +17,7 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i8_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -51,6 +52,7 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i8_zext_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -85,6 +87,7 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
   ; HSA-VI-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
   ; HSA-VI-NEXT:   G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i8_sext_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -119,6 +122,7 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i16_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -153,6 +157,7 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i16_zext_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -187,6 +192,7 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig
   ; HSA-VI-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
   ; HSA-VI-NEXT:   G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i16_sext_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -220,6 +226,7 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -252,6 +259,7 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: f32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -284,6 +292,7 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s8>), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store (<2 x s8>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v2i8_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -316,6 +325,7 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s16>), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store (<2 x s16>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v2i16_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -348,6 +358,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v2i32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -380,6 +391,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v2f32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -412,6 +424,7 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s8>), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store (<3 x s8>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v3i8_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -444,6 +457,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s16>), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store (<3 x s16>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v3i16_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -476,6 +490,7 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v3i32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -508,6 +523,7 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v3f32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -540,6 +556,7 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s8>), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store (<4 x s8>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v4i8_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -572,6 +589,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store (<4 x s16>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v4i16_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -604,6 +622,7 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v4i32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -636,6 +655,7 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v4f32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -668,6 +688,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s8>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store (<8 x s8>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v8i8_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -700,6 +721,7 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s16>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store (<8 x s16>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v8i16_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -732,6 +754,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v8i32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -764,6 +787,7 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v8f32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -796,6 +820,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s8>), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store (<16 x s8>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v16i8_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -828,6 +853,7 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s16>), align 16, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store (<16 x s16>) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v16i16_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -860,6 +886,7 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v16i32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -892,6 +919,7 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v16f32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -924,6 +952,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: kernel_arg_i64
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -955,6 +984,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double  %in) {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: f64_kernel_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -987,6 +1017,7 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store (s1) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i1_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1019,6 +1050,7 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i32
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1053,6 +1085,7 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i64
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1087,6 +1120,7 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
   ; HSA-VI-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
   ; HSA-VI-NEXT:   G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i32
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1121,6 +1155,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
   ; HSA-VI-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
   ; HSA-VI-NEXT:   G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i64
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1154,6 +1189,7 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind {
   ; HSA-VI-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; HSA-VI-NEXT:   G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: empty_struct_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1181,6 +1217,7 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind {
   ; HSA-VI-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; HSA-VI-NEXT:   G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: empty_array_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1226,12 +1263,13 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad,
   ; HSA-VI-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
   ; HSA-VI-NEXT:   [[LOAD4:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
   ; HSA-VI-NEXT:   [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; HSA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD3]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD4]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD3]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD4]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: struct_argument_alignment
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1253,11 +1291,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad,
   ; LEGACY-MESA-VI-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
   ; LEGACY-MESA-VI-NEXT:   [[LOAD4:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
   ; LEGACY-MESA-VI-NEXT:   [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD3]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD4]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD3]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD4]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; LEGACY-MESA-VI-NEXT:   S_ENDPGM 0
   %val0 = extractvalue {i32, i64} %arg0, 0
   %val1 = extractvalue {i32, i64} %arg0, 1
@@ -1293,12 +1331,13 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add
   ; HSA-VI-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
   ; HSA-VI-NEXT:   [[LOAD4:%[0-9]+]]:_(p1234) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
   ; HSA-VI-NEXT:   [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; HSA-VI-NEXT:   G_STORE [[LOAD]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD1]](p1), [[C5]](p1) :: (volatile store (p1) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD3]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD4]](p1234), [[C5]](p1) :: (volatile store (p1234) into `ptr addrspace(1) null`, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD1]](p1), [[C5]](p1) :: (volatile store (p1) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD3]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD4]](p1234), [[C5]](p1) :: (volatile store (p1234) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: pointer_in_struct_argument
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1320,11 +1359,11 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add
   ; LEGACY-MESA-VI-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64)
   ; LEGACY-MESA-VI-NEXT:   [[LOAD4:%[0-9]+]]:_(p1234) = G_LOAD [[PTR_ADD4]](p4) :: (dereferenceable invariant load (s64), align 4, addrspace 4)
   ; LEGACY-MESA-VI-NEXT:   [[C5:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD1]](p1), [[C5]](p1) :: (volatile store (p1) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD3]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD4]](p1234), [[C5]](p1) :: (volatile store (p1234) into `ptr addrspace(1) null`, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD1]](p1), [[C5]](p1) :: (volatile store (p1) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD2]](s8), [[C5]](p1) :: (volatile store (s8) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD3]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD4]](p1234), [[C5]](p1) :: (volatile store (p1234) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; LEGACY-MESA-VI-NEXT:   S_ENDPGM 0
   %val0 = extractvalue {ptr addrspace(3), ptr addrspace(1)} %arg0, 0
   %val1 = extractvalue {ptr addrspace(3), ptr addrspace(1)} %arg0, 1
@@ -1359,11 +1398,12 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   ; HSA-VI-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
   ; HSA-VI-NEXT:   [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s64), align 1, addrspace 4)
   ; HSA-VI-NEXT:   [[C4:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; HSA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; HSA-VI-NEXT:   G_STORE [[LOAD3]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; HSA-VI-NEXT:   G_STORE [[LOAD3]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: packed_struct_argument_alignment
   ; LEGACY-MESA-VI: bb.1 (%ir-block.1):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1382,10 +1422,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   ; LEGACY-MESA-VI-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64)
   ; LEGACY-MESA-VI-NEXT:   [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load (s64), align 1, addrspace 4)
   ; LEGACY-MESA-VI-NEXT:   [[C4:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD2]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD3]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD1]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD2]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD3]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; LEGACY-MESA-VI-NEXT:   S_ENDPGM 0
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -1405,6 +1445,7 @@ define amdgpu_kernel void @unused_i32_arg(ptr addrspace(1) nocapture %out, i32 %
   ; HSA-VI-NEXT: {{  $}}
   ; HSA-VI-NEXT:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: unused_i32_arg
   ; LEGACY-MESA-VI: bb.1.entry:
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1431,6 +1472,7 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1466,6 +1508,7 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou
   ; HSA-VI-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
   ; HSA-VI-NEXT:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1504,6 +1547,7 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou
   ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1545,6 +1589,7 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %
   ; HSA-VI-NEXT:   G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store (<4 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1582,10 +1627,11 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
   ; HSA-VI-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 260
   ; HSA-VI-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
-  ; HSA-VI-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
+  ; HSA-VI-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, align 256, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1599,7 +1645,7 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
   ; LEGACY-MESA-VI-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 296
   ; LEGACY-MESA-VI-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
   ; LEGACY-MESA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
-  ; LEGACY-MESA-VI-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
+  ; LEGACY-MESA-VI-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, align 256, addrspace 4)
   ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; LEGACY-MESA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; LEGACY-MESA-VI-NEXT:   S_ENDPGM 0
@@ -1627,6 +1673,7 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
   ; HSA-VI-NEXT:   G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store (<16 x s32>) into %ir.out, align 4, addrspace 1)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.1):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1666,6 +1713,7 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out,
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.in.byref, addrspace 1)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1700,6 +1748,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (dereferenceable load (s32) from %ir.in.byref)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1734,6 +1783,7 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 6)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1768,6 +1818,7 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (dereferenceable load (s32) from %ir.in.byref, addrspace 999)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1803,6 +1854,7 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out,
   ; HSA-VI-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (dereferenceable load (s32) from %ir.in.byref, addrspace 3)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1844,6 +1896,7 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu
   ; HSA-VI-NEXT:   G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1885,6 +1938,7 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref
   ; HSA-VI-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
   ; HSA-VI-NEXT:   G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1913,6 +1967,7 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind {
   ; HSA-VI-NEXT:   [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 9
   ; HSA-VI-NEXT:   G_STORE [[C1]](s8), [[LOAD]](p3) :: (store (s8) into %ir.arg, align 4, addrspace 3)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: p3i8_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1936,8 +1991,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind {
   ; HSA-VI-NEXT:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
   ; HSA-VI-NEXT:   [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9
   ; HSA-VI-NEXT:   [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0
-  ; HSA-VI-NEXT:   G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3)
+  ; HSA-VI-NEXT:   G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: p1i8_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1945,7 +2001,7 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind {
   ; LEGACY-MESA-VI-NEXT:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
   ; LEGACY-MESA-VI-NEXT:   [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9
   ; LEGACY-MESA-VI-NEXT:   [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0
-  ; LEGACY-MESA-VI-NEXT:   G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3)
+  ; LEGACY-MESA-VI-NEXT:   G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; LEGACY-MESA-VI-NEXT:   S_ENDPGM 0
   store i8 9, ptr addrspace(3) null
   ret void
@@ -1963,6 +2019,7 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind {
   ; HSA-VI-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; HSA-VI-NEXT:   G_STORE [[LOAD]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v2p1i8_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -1990,6 +2047,7 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind {
   ; HSA-VI-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; HSA-VI-NEXT:   G_STORE [[LOAD]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v2p3i8_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
@@ -2023,6 +2081,7 @@ define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p
   ; HSA-VI-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
   ; HSA-VI-NEXT:   G_STORE [[LOAD1]](<2 x p3>), [[PTR_ADD2]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef` + 16, align 16, addrspace 1)
   ; HSA-VI-NEXT:   S_ENDPGM 0
+  ;
   ; LEGACY-MESA-VI-LABEL: name: v2p1i8_in_struct_arg
   ; LEGACY-MESA-VI: bb.1 (%ir-block.0):
   ; LEGACY-MESA-VI-NEXT:   liveins: $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
index 820e41de575e59c..2f9c8c225b2f0ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
@@ -26,7 +26,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
   ; GCN-NEXT:   [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
   ; GCN-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; GCN-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = nuw G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32)
-  ; GCN-NEXT:   G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store (s8) into %ir.in.val, addrspace 5)
+  ; GCN-NEXT:   G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store (s8) into %ir.in.val, align 4, addrspace 5)
   ; GCN-NEXT:   G_STORE [[C1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.in.gep1, addrspace 5)
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32
@@ -69,7 +69,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
   ; GCN-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 8, implicit-def $scc
   ; GCN-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = nuw G_PTR_ADD [[FRAME_INDEX1]], [[C2]](s32)
-  ; GCN-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (dereferenceable load (s8) from %ir.out.val, addrspace 5)
+  ; GCN-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (dereferenceable load (s8) from %ir.out.val, align 4, addrspace 5)
   ; GCN-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (dereferenceable load (s32) from %ir.out.gep1, addrspace 5)
   ; GCN-NEXT:   G_STORE [[LOAD]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   G_STORE [[LOAD1]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 2ede504223cb82f..830f95d62a1bf83 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -1021,7 +1021,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, addrspace 1)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i64
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1390,7 +1390,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2p0() #0 {
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x p0>) from `ptr addrspace(1) null`, addrspace 1)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x p0>) from `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2p0
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1455,7 +1455,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[DEF]](s64)
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, addrspace 1)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<3 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i64
@@ -1525,7 +1525,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 17179869187
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[C2]](s64)
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, addrspace 1)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 3)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i64
@@ -4056,7 +4056,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.val
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = nuw G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32)
-  ; CHECK-NEXT:   G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store (s8) into %ir.val, addrspace 5)
+  ; CHECK-NEXT:   G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store (s8) into %ir.val, align 4, addrspace 5)
   ; CHECK-NEXT:   G_STORE [[C1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.gep1, addrspace 5)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_byval_struct_i8_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
index cc4796e534aba98..58e39880fdf275f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
@@ -921,13 +921,13 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
   ; GCN-NEXT:   [[FRAME_INDEX34:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca0
   ; GCN-NEXT:   [[FRAME_INDEX35:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1.alloca1
-  ; GCN-NEXT:   G_STORE [[C]](s32), [[FRAME_INDEX34]](p5) :: (store (s32) into %ir.alloca0, addrspace 5)
+  ; GCN-NEXT:   G_STORE [[C]](s32), [[FRAME_INDEX34]](p5) :: (store (s32) into %ir.alloca0, align 16, addrspace 5)
   ; GCN-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; GCN-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C2]](s32)
   ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.alloca0 + 4, addrspace 5)
   ; GCN-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
   ; GCN-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32)
-  ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca0 + 8, addrspace 5)
+  ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca0 + 8, align 8, addrspace 5)
   ; GCN-NEXT:   G_STORE [[C1]](s64), [[FRAME_INDEX35]](p5) :: (store (s64) into %ir.alloca1, addrspace 5)
   ; GCN-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX35]], [[C3]](s32)
   ; GCN-NEXT:   G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store (s64) into %ir.alloca1 + 8, addrspace 5)
@@ -1088,13 +1088,13 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GCN-NEXT:   [[FRAME_INDEX34:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
-  ; GCN-NEXT:   G_STORE [[C]](s32), [[FRAME_INDEX34]](p5) :: (store (s32) into %ir.alloca, addrspace 5)
+  ; GCN-NEXT:   G_STORE [[C]](s32), [[FRAME_INDEX34]](p5) :: (store (s32) into %ir.alloca, align 16, addrspace 5)
   ; GCN-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; GCN-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C2]](s32)
   ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.alloca + 4, addrspace 5)
   ; GCN-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
   ; GCN-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32)
-  ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, addrspace 5)
+  ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, align 8, addrspace 5)
   ; GCN-NEXT:   [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed
   ; GCN-NEXT:   [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]]
   ; GCN-NEXT:   [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]]
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index c7bc584d42d5ad3..54efe3c30f41689 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -207,22 +207,14 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_func_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
-; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s7
-; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
-; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s9
-; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s4
-; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s5
-; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-NEXT:    s_load_dword s6, s[8:9], 0x0
+; FIXEDABI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s10
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s11
 ; FIXEDABI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
   %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
   %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -238,15 +230,11 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_add_u32 s0, s4, 8
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT:    s_addc_u32 s1, s5, 0
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s0
-; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s1
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-NEXT:    s_load_dword s0, s[4:5], 0x8
+; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; FIXEDABI-NEXT:    s_endpgm
   %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
   %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -262,9 +250,8 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
 define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
 ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    v_mov_b32_e32 v0, 0
-; FIXEDABI-NEXT:    v_mov_b32_e32 v1, 0
-; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; FIXEDABI-NEXT:    s_mov_b64 s[0:1], 0
+; FIXEDABI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; FIXEDABI-NEXT:    s_endpgm
   %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5990736f664fbe8..ca0df9f2072a200 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -26,7 +26,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc
   ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
@@ -149,8 +149,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.10.Flow33:
@@ -166,8 +166,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.12.Flow34:
@@ -183,8 +183,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.14.Flow35:
@@ -223,8 +223,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.18(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.18.Flow37:
@@ -240,8 +240,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.20(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.20.Flow38:
@@ -257,8 +257,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.22(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.22.Flow39:
@@ -274,8 +274,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.24(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.24.Flow40:
@@ -291,8 +291,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.26(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.26.Flow41:
@@ -308,8 +308,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.28(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.28.Flow42:
@@ -333,8 +333,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.31(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.31.Flow44:
@@ -359,8 +359,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.29(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.29
   ; GFX90A-NEXT: {{  $}}
@@ -739,7 +739,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
+  ; GFX90A-NEXT:   renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, align 8, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
@@ -773,7 +773,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec
@@ -829,7 +829,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
@@ -941,7 +941,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr19, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr17, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr3, renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   S_BRANCH %bb.65
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.68.bb174:
@@ -984,18 +984,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr33 = COPY renamable $vgpr29, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr53 = COPY renamable $vgpr29, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr29, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr29, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr29, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 2147483648, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 2147483648, addrspace 5)
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.71.Flow9:
   ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
@@ -1011,7 +1011,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr5 = V_OR_B32_e32 $vgpr52, killed $vgpr18, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr12 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr13, renamable $vgpr12_vgpr13, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr13, renamable $vgpr12_vgpr13, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 2147483648, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.69
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index af023835c529776..34517b4c58f9f16 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -439,17 +439,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:6
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5] offset:4
-; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:8
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX900-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:4
-; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_mov_b32_e32 v1, v0
+; GFX900-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT:    s_nop 0
 ; GFX900-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
-; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX900-NEXT:    s_endpgm
@@ -500,13 +496,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:8
 ; GFX10_DEFAULT-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10_DEFAULT-NEXT:    s_clause 0x1
-; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX10_DEFAULT-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:4
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(1)
-; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10_DEFAULT-NEXT:    s_clause 0x2
+; GFX10_DEFAULT-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
+; GFX10_DEFAULT-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
 ; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
 ; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f827a78125b7785..b5825f63cc2b95a 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -3106,16 +3106,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    global_load_sbyte v2, v[1:2], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i8 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i8 at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3137,18 +3139,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 2
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8 at abs32@hi
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_ushort v0, v[3:4], off
+; GFX10-NEXT:    global_load_sbyte v2, v[5:6], off
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8 at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3171,18 +3176,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 2
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8 at abs32@hi
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    global_load_i8 v2, v[2:3], off
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8 at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
@@ -3203,18 +3211,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 2
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[3:4], off
+; GFX10-SCRATCH-NEXT:    global_load_sbyte v2, v[5:6], off
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8 at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3384,19 +3395,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    global_load_sbyte v4, v[1:2], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i8 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i8 at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_mov_b32_e32 v0, v5
-; GFX9-NEXT:    v_mov_b32_e32 v4, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3424,15 +3436,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i8 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i8 at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v1, 4
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    global_load_sbyte v4, v[1:2], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3460,15 +3473,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[5:6], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v1, 4
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    global_load_i8 v4, v[1:2], off
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v5
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_mov_b32_e32 v4, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3496,15 +3510,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i8 at abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i8 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 4
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    global_load_sbyte v4, v[1:2], off
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v6
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
@@ -4298,39 +4313,43 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_dword v0, v[41:42], off
+; GFX9-NEXT:    v_mov_b32_e32 v43, 2
+; GFX9-NEXT:    global_load_ushort v0, v[41:42], off
+; GFX9-NEXT:    v_mov_b32_e32 v44, 0
+; GFX9-NEXT:    global_load_sbyte v2, v[43:44], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret at abs32@lo
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_byte v[3:4], v2, off
+; GFX9-NEXT:    global_store_byte v[43:44], v2, off
 ; GFX9-NEXT:    global_store_short v[41:42], v0, off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s34
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4341,41 +4360,46 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v42, 0
+; GFX10-NEXT:    v_mov_b32_e32 v43, 2
+; GFX10-NEXT:    v_mov_b32_e32 v44, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_ushort v0, v[41:42], off
+; GFX10-NEXT:    global_load_sbyte v2, v[43:44], off
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dword v0, v[41:42], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x400
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT:    v_mov_b32_e32 v3, 2
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-NEXT:    global_store_byte v[3:4], v2, off
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    global_store_byte v[43:44], v2, off
 ; GFX10-NEXT:    global_store_short v[41:42], v0, off
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT:    s_mov_b32 s33, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -4386,43 +4410,47 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
+; GFX11-NEXT:    v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v43, 2
+; GFX11-NEXT:    v_mov_b32_e32 v44, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_u16 v0, v[41:42], off
+; GFX11-NEXT:    global_load_i8 v2, v[43:44], off
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b32 v0, v[41:42], off
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 32
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
-; GFX11-NEXT:    global_store_b16 v[41:42], v3, off
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    global_store_b8 v[43:44], v2, off
+; GFX11-NEXT:    global_store_b16 v[41:42], v0, off
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -4433,41 +4461,46 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v43, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v44, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v43, 2
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v44, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[41:42], off
+; GFX10-SCRATCH-NEXT:    global_load_sbyte v2, v[43:44], off
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[41:42], off
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 2
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX10-SCRATCH-NEXT:    global_store_byte v[3:4], v2, off
+; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-SCRATCH-NEXT:    global_store_byte v[43:44], v2, off
 ; GFX10-SCRATCH-NEXT:    global_store_short v[41:42], v0, off
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    s_clause 0x3
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v44, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v43, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
@@ -4680,45 +4713,48 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[41:42], off
+; GFX9-NEXT:    v_mov_b32_e32 v43, 4
+; GFX9-NEXT:    global_load_dword v0, v[41:42], off
+; GFX9-NEXT:    v_mov_b32_e32 v44, 0
+; GFX9-NEXT:    global_load_sbyte v4, v[43:44], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i8_ret at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i8_ret at abs32@lo
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_mov_b32_e32 v0, v5
-; GFX9-NEXT:    v_mov_b32_e32 v4, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v0, 4
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_byte v[0:1], v4, off
-; GFX9-NEXT:    global_store_dword v[41:42], v2, off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    global_store_byte v[43:44], v4, off
+; GFX9-NEXT:    global_store_dword v[41:42], v0, off
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s34
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4729,47 +4765,50 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v42, 0
+; GFX10-NEXT:    v_mov_b32_e32 v43, 4
+; GFX10-NEXT:    v_mov_b32_e32 v44, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i8_ret at abs32@hi
+; GFX10-NEXT:    global_load_dword v0, v[41:42], off
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i8_ret at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[41:42], off
+; GFX10-NEXT:    global_load_sbyte v4, v[43:44], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x400
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mov_b32_e32 v0, 4
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_byte v[0:1], v4, off
-; GFX10-NEXT:    global_store_dword v[41:42], v2, off
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    global_store_byte v[43:44], v4, off
+; GFX10-NEXT:    global_store_dword v[41:42], v0, off
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT:    s_mov_b32 s33, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -4780,28 +4819,30 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
+; GFX11-NEXT:    v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v43, 4
+; GFX11-NEXT:    v_mov_b32_e32 v44, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
+; GFX11-NEXT:    global_load_b32 v0, v[41:42], off
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[5:6], v[41:42], off
+; GFX11-NEXT:    global_load_i8 v4, v[43:44], off
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 32
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v5
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_mov_b32_e32 v4, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
@@ -4812,21 +4853,22 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
-; GFX11-NEXT:    global_store_b32 v[41:42], v2, off
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    global_store_b8 v[43:44], v4, off
+; GFX11-NEXT:    global_store_b32 v[41:42], v0, off
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -4837,47 +4879,50 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v43, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v44, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v43, 4
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v44, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
+; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[41:42], off
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[5:6], v[41:42], off
+; GFX10-SCRATCH-NEXT:    global_load_sbyte v4, v[43:44], off
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v6
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 4
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    global_store_byte v[0:1], v4, off
-; GFX10-SCRATCH-NEXT:    global_store_dword v[41:42], v2, off
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-SCRATCH-NEXT:    global_store_byte v[43:44], v4, off
+; GFX10-SCRATCH-NEXT:    global_store_dword v[41:42], v0, off
+; GFX10-SCRATCH-NEXT:    s_clause 0x3
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v44, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v43, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 9760e93eb48e6bd..a8499f6e0a4cc5a 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -375,71 +375,47 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V3:       ; %bb.0:
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V3-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8V3-NEXT:    s_add_u32 s0, s8, 8
-; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V3-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8V3-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8V3-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V3-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8V3-NEXT:    v_mov_b32_e32 v2, s10
-; GFX8V3-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8V3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V3-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX8V3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V3-NEXT:    s_endpgm
 ;
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
-; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V4-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V4-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX8V4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8V4-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8V4-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX8V4-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V4-NEXT:    s_endpgm
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8V5-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8V5-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V5-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V5-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V5-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8V5-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
@@ -448,51 +424,54 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V3:       ; %bb.0:
-; GFX9V3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V3-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9V3-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9V3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9V3-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX9V3-NEXT:    ; kill: killed $sgpr6_sgpr7
+; GFX9V3-NEXT:    ; kill: killed $sgpr8_sgpr9
 ; GFX9V3-NEXT:    ; kill: killed $sgpr4_sgpr5
-; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V3-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V3-NEXT:    s_endpgm
 ;
 ; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9V4-NEXT:    global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V4-NEXT:    global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V4-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V4-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V4-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V4-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9V4-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX9V4-NEXT:    ; kill: killed $sgpr6_sgpr7
+; GFX9V4-NEXT:    ; kill: killed $sgpr8_sgpr9
 ; GFX9V4-NEXT:    ; kill: killed $sgpr4_sgpr5
-; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V4-NEXT:    s_endpgm
 ;
 ; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[0:1] glc
-; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[6:7] offset:8 glc
-; GFX9V5-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
-; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V5-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V5-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9V5-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9V5-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX9V5-NEXT:    ; kill: killed $sgpr6_sgpr7
 ; GFX9V5-NEXT:    ; kill: killed $sgpr4_sgpr5
-; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 0e6c1aecb6774a3..af8d2596851d420 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v9, s7
 ; CHECK-NEXT:    v_mov_b32_e32 v8, s6
-; CHECK-NEXT:    flat_load_ubyte v10, v[8:9] offset:5
-; CHECK-NEXT:    flat_load_ubyte v11, v[8:9] offset:6
-; CHECK-NEXT:    flat_load_ubyte v12, v[8:9] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[8:9] offset:3
-; CHECK-NEXT:    flat_load_ubyte v14, v[8:9] offset:2
-; CHECK-NEXT:    flat_load_ubyte v15, v[8:9] offset:1
-; CHECK-NEXT:    flat_load_ubyte v16, v[8:9]
-; CHECK-NEXT:    flat_load_ubyte v17, v[8:9] offset:4
-; CHECK-NEXT:    flat_load_ubyte v18, v[8:9] offset:13
-; CHECK-NEXT:    flat_load_ubyte v19, v[8:9] offset:14
-; CHECK-NEXT:    flat_load_ubyte v20, v[8:9] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[8:9] offset:11
-; CHECK-NEXT:    flat_load_ubyte v22, v[8:9] offset:10
-; CHECK-NEXT:    flat_load_ubyte v23, v[8:9] offset:9
-; CHECK-NEXT:    flat_load_ubyte v24, v[8:9] offset:8
-; CHECK-NEXT:    flat_load_ubyte v25, v[8:9] offset:12
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, s6, v6
+; CHECK-NEXT:    v_mov_b32_e32 v13, s7
+; CHECK-NEXT:    v_add_co_u32_e32 v12, vcc, s6, v6
 ; CHECK-NEXT:    v_cmp_ge_u64_e64 s[8:9], s[4:5], 2
-; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v7, vcc
+; CHECK-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v7, vcc
 ; CHECK-NEXT:    s_add_u32 s6, s6, 16
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[8:9], v13 offset:3
-; CHECK-NEXT:    flat_store_byte v[8:9], v14 offset:2
-; CHECK-NEXT:    flat_store_byte v[8:9], v15 offset:1
-; CHECK-NEXT:    flat_store_byte v[8:9], v16
-; CHECK-NEXT:    flat_store_byte v[8:9], v12 offset:7
-; CHECK-NEXT:    flat_store_byte v[8:9], v11 offset:6
-; CHECK-NEXT:    flat_store_byte v[8:9], v10 offset:5
-; CHECK-NEXT:    flat_store_byte v[8:9], v17 offset:4
-; CHECK-NEXT:    flat_store_byte v[8:9], v21 offset:11
-; CHECK-NEXT:    flat_store_byte v[8:9], v22 offset:10
-; CHECK-NEXT:    flat_store_byte v[8:9], v23 offset:9
-; CHECK-NEXT:    flat_store_byte v[8:9], v24 offset:8
-; CHECK-NEXT:    flat_store_byte v[8:9], v20 offset:15
-; CHECK-NEXT:    flat_store_byte v[8:9], v19 offset:14
-; CHECK-NEXT:    flat_store_byte v[8:9], v18 offset:13
-; CHECK-NEXT:    flat_store_byte v[8:9], v25 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_2
 ; CHECK-NEXT:  ; %bb.3: ; %loop-memcpy-residual-header
 ; CHECK-NEXT:    s_and_b32 s4, 32, 15
@@ -127,47 +98,18 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v10, s10
 ; CHECK-NEXT:    v_mov_b32_e32 v11, s11
-; CHECK-NEXT:    flat_load_ubyte v12, v[10:11] offset:5
-; CHECK-NEXT:    flat_load_ubyte v13, v[10:11] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[10:11] offset:7
-; CHECK-NEXT:    flat_load_ubyte v15, v[10:11] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[10:11] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[10:11] offset:1
-; CHECK-NEXT:    flat_load_ubyte v18, v[10:11]
-; CHECK-NEXT:    flat_load_ubyte v19, v[10:11] offset:4
-; CHECK-NEXT:    flat_load_ubyte v20, v[10:11] offset:13
-; CHECK-NEXT:    flat_load_ubyte v21, v[10:11] offset:14
-; CHECK-NEXT:    flat_load_ubyte v22, v[10:11] offset:15
-; CHECK-NEXT:    flat_load_ubyte v23, v[10:11] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[10:11] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[10:11] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[10:11] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[10:11] offset:12
+; CHECK-NEXT:    flat_load_dwordx4 v[10:13], v[10:11]
+; CHECK-NEXT:    v_mov_b32_e32 v15, s11
 ; CHECK-NEXT:    s_add_u32 s14, s14, 1
-; CHECK-NEXT:    v_add_co_u32_e32 v10, vcc, s10, v2
-; CHECK-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v3, vcc
+; CHECK-NEXT:    v_add_co_u32_e32 v14, vcc, s10, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
 ; CHECK-NEXT:    s_addc_u32 s15, s15, 0
 ; CHECK-NEXT:    s_add_u32 s10, s10, 16
 ; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5]
 ; CHECK-NEXT:    s_addc_u32 s11, s11, 0
 ; CHECK-NEXT:    s_or_b64 s[12:13], vcc, s[12:13]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[10:11], v15 offset:3
-; CHECK-NEXT:    flat_store_byte v[10:11], v16 offset:2
-; CHECK-NEXT:    flat_store_byte v[10:11], v17 offset:1
-; CHECK-NEXT:    flat_store_byte v[10:11], v18
-; CHECK-NEXT:    flat_store_byte v[10:11], v14 offset:7
-; CHECK-NEXT:    flat_store_byte v[10:11], v13 offset:6
-; CHECK-NEXT:    flat_store_byte v[10:11], v12 offset:5
-; CHECK-NEXT:    flat_store_byte v[10:11], v19 offset:4
-; CHECK-NEXT:    flat_store_byte v[10:11], v23 offset:11
-; CHECK-NEXT:    flat_store_byte v[10:11], v24 offset:10
-; CHECK-NEXT:    flat_store_byte v[10:11], v25 offset:9
-; CHECK-NEXT:    flat_store_byte v[10:11], v26 offset:8
-; CHECK-NEXT:    flat_store_byte v[10:11], v22 offset:15
-; CHECK-NEXT:    flat_store_byte v[10:11], v21 offset:14
-; CHECK-NEXT:    flat_store_byte v[10:11], v20 offset:13
-; CHECK-NEXT:    flat_store_byte v[10:11], v27 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[12:13]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_14
 ; CHECK-NEXT:  .LBB0_15: ; %Flow20
@@ -251,22 +193,14 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
 ; CHECK-NEXT:  .LBB1_8: ; %post-loop-memcpy-expansion
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:8
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:12
+; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:2
+; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:6
+; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:4
+; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:10
+; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:8
+; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:14
+; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:12
 ; CHECK-NEXT:  .LBB1_9: ; %loop-memcpy-expansion2
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_9
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 930ba80ad69638d..56ed4f1014098e1 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -479,7 +479,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
   ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
   ; SI-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
-  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
+  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.7
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2.if.then9:
@@ -496,7 +496,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT:   successors: %bb.6(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
-  ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s8) from `ptr addrspace(1) null`, addrspace 1)
+  ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s8) from `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.6
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.5.Flow:
@@ -511,7 +511,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
   ; SI-NEXT:   [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
-  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
+  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, align 4294967296, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.5
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7.UnifiedReturnBlock:



More information about the llvm-commits mailing list