[llvm] [AMDGPU][GlobalISel] Enable kernel argument preloading (PR #134655)

Tim Gymnich via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 10 09:57:31 PDT 2025


https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/134655

>From fef2d0fad6439e9a29d7298ca721028552408bd1 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Apr 2025 11:39:52 +0000
Subject: [PATCH 1/2] [AMDGPU][GlobalISel] Enable kernel argument preloading

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  96 +++++++++--
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h   |   4 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   9 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 156 +++++++++++-------
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   1 -
 5 files changed, 185 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a15f193549936..1e09d70c5dc56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,8 +20,10 @@
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/MC/MCRegister.h"
 
 #define DEBUG_TYPE "amdgpu-call-lowering"
 
@@ -497,6 +499,65 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
+void AMDGPUCallLowering::lowerPreloadedParameter(
+    MachineIRBuilder &B, ArrayRef<Register> VRegs, Type *ArgTy,
+    uint64_t ArgOffset, Align Alignment,
+    ArrayRef<MCRegister> PreloadRegs) const {
+  MachineFunction &MF = B.getMF();
+  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const DataLayout &DL = B.getDataLayout();
+
+  LLT ResTy = getLLTForType(*ArgTy, DL);
+  LLT ScalarTy = LLT::scalar(DL.getTypeSizeInBits(ArgTy));
+  unsigned TotalSize = 0;
+  SmallVector<Register> SrcRegs(PreloadRegs.size());
+
+  for (auto &&[Idx, PhysReg] : enumerate(PreloadRegs)) {
+    Register VReg = MRI.getLiveInVirtReg(PhysReg);
+    TypeSize RegSize = TRI->getRegSizeInBits(VReg, MRI);
+
+    if (!MRI.getVRegDef(VReg)) {
+      MRI.setType(VReg, LLT::scalar(RegSize));
+      B.getMBB().addLiveIn(PhysReg);
+      B.buildInstr(TargetOpcode::COPY).addDef(VReg).addReg(PhysReg);
+    }
+
+    if (DL.getTypeStoreSize(ArgTy) < 4 && Alignment < 4) {
+      int64_t AlignDownOffset = alignDown(ArgOffset, 4);
+      int64_t OffsetDiff = ArgOffset - AlignDownOffset;
+      auto ShiftAmt = B.buildConstant(LLT::scalar(32), OffsetDiff * 8);
+      auto Shift = B.buildRotateLeft(LLT::scalar(RegSize), VReg, ShiftAmt);
+
+      if (ResTy.isVector()) {
+        B.buildBitcast(VRegs[0], B.buildTrunc(ScalarTy, Shift));
+      } else {
+        B.buildTrunc(VRegs[0], Shift);
+      }
+
+      return;
+    }
+
+    TotalSize += RegSize;
+    SrcRegs[Idx] = VReg;
+  }
+
+  LLT MergeTy = LLT::scalar(TotalSize);
+  Register Res = SrcRegs.back();
+
+  if (SrcRegs.size() > 1)
+    Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);
+
+  if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits())
+    Res = B.buildTrunc(ScalarTy, Res).getReg(0);
+
+  if (ResTy.isVector())
+    Res = B.buildBitcast(ResTy, Res).getReg(0);
+
+  MRI.replaceRegWith(Res, VRegs[0]);
+}
+
 bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
     MachineIRBuilder &B, const Function &F,
     ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -513,6 +574,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
 
   allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
 
+  if (Subtarget->hasKernargPreload())
+    TLI.allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
+
   unsigned i = 0;
   const Align KernArgBaseAlign(16);
   const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
@@ -520,12 +584,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
 
   // TODO: Align down to dword alignment and extract bits for extending loads.
   for (auto &Arg : F.args()) {
-    // TODO: Add support for kernarg preload.
-    if (Arg.hasAttribute("amdgpu-hidden-argument")) {
-      LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
-      return false;
-    }
-
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
@@ -559,13 +617,29 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
 
         B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
       }
-    } else {
-      ArgInfo OrigArg(VRegs[i], Arg, i);
-      const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
-      setArgFlags(OrigArg, OrigArgIdx, DL, F);
-      lowerParameter(B, OrigArg, ArgOffset, Alignment);
+      i++;
+      continue;
     }
 
+    auto &PreloadKernArgs = Info->getArgInfo().PreloadKernArgs;
+    auto PreloadKernArg =
+        Arg.hasInRegAttr() ? PreloadKernArgs.find(i) : PreloadKernArgs.end();
+    if (PreloadKernArg != PreloadKernArgs.end()) {
+      lowerPreloadedParameter(B, VRegs[i], ArgTy, ArgOffset, Alignment,
+                              PreloadKernArg->getSecond().Regs);
+      ++i;
+      continue;
+    }
+
+    if (Arg.hasAttribute("amdgpu-hidden-argument"))
+      F.getContext().diagnose(DiagnosticInfoUnsupported(
+          F, "hidden argument in kernel signature was not preloaded",
+          B.getDL()));
+
+    ArgInfo OrigArg(VRegs[i], Arg, i);
+    const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
+    setArgFlags(OrigArg, OrigArgIdx, DL, F);
+    lowerParameter(B, OrigArg, ArgOffset, Alignment);
     ++i;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f2a547b..74fce411e2851 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -30,6 +30,10 @@ class AMDGPUCallLowering final : public CallLowering {
   void lowerParameter(MachineIRBuilder &B, ArgInfo &AI, uint64_t Offset,
                       Align Alignment) const;
 
+  void lowerPreloadedParameter(MachineIRBuilder &B, ArrayRef<Register> VRegs,
+                               Type *ArgTy, uint64_t ArgOffset, Align Alignment,
+                               ArrayRef<MCRegister> PreloadRegs) const;
+
   bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv,
                       SmallVectorImpl<BaseArgInfo> &Outs,
                       bool IsVarArg) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 533ad349f7500..653f18062405c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1207,8 +1207,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
   uint64_t ExplicitArgOffset = 0;
   const DataLayout &DL = Fn.getDataLayout();
 
-  unsigned InIndex = 0;
-
   for (const Argument &Arg : Fn.args()) {
     const bool IsByRef = Arg.hasByRefAttr();
     Type *BaseArgTy = Arg.getType();
@@ -1297,10 +1295,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
 
       unsigned PartOffset = 0;
       for (unsigned i = 0; i != NumRegs; ++i) {
-        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
-                                               BasePartOffset + PartOffset,
-                                               MemVT.getSimpleVT(),
-                                               CCValAssign::Full));
+        State.addLoc(CCValAssign::getCustomMem(
+            Arg.getArgNo(), RegisterVT, BasePartOffset + PartOffset,
+            MemVT.getSimpleVT(), CCValAssign::Full));
         PartOffset += MemVT.getStoreSize();
       }
     }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da95672..15505f3124cec 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2536,84 +2536,114 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+                                uint64_t ArgOffset, unsigned ArgSize,
+                                unsigned Idx, MachineFunction &MF,
+                                const SIRegisterInfo &TRI,
+                                SIMachineFunctionInfo &Info, CCState &CCInfo) {
+  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
+  const Align KernelArgBaseAlign = Align(16);
+  Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+  unsigned NumAllocSGPRs = alignTo(ArgSize, 4) / 4;
+
+  // Arg is preloaded into the previous SGPR.
+  if (ArgSize < 4 && Alignment < 4) {
+    assert(Idx >= 1 && "No previous SGPR");
+    AMDGPUFunctionArgInfo &ArgInfo = Info.getArgInfo();
+    auto &ArgDesc = ArgInfo.PreloadKernArgs[Idx];
+    auto &PrevArgDesc = ArgInfo.PreloadKernArgs[Idx - 1];
+    ArgDesc.Regs.push_back(PrevArgDesc.Regs[0]);
+    return true;
+  }
+
+  unsigned Padding = ArgOffset - LastExplicitArgOffset;
+  unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+  // Check for free user SGPRs for preloading.
+  if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs())
+    return false;
+
+  // Preload this argument.
+  const TargetRegisterClass *RC =
+      TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+  SmallVectorImpl<MCRegister> *PreloadRegs =
+      Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
+  if (PreloadRegs->size() > 1)
+    RC = &AMDGPU::SGPR_32RegClass;
+
+  for (MCRegister Reg : *PreloadRegs) {
+    assert(Reg);
+    MF.addLiveIn(Reg, RC);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+  return true;
+}
+
 // Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
 // sequential starting from the first argument.
 void SITargetLowering::allocatePreloadKernArgSGPRs(
-    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
-    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
     const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
   Function &F = MF.getFunction();
-  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
-  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
-  bool InPreloadSequence = true;
-  unsigned InIdx = 0;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
+  uint64_t ExplicitArgOffset = BaseOffset;
+  uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+  unsigned LocIdx = 0;
   bool AlignedForImplictArgs = false;
   unsigned ImplicitArgOffset = 0;
+
   for (auto &Arg : F.args()) {
-    if (!InPreloadSequence || !Arg.hasInRegAttr())
+    if (!Arg.hasInRegAttr())
       break;
 
-    unsigned ArgIdx = Arg.getArgNo();
-    // Don't preload non-original args or parts not in the current preload
-    // sequence.
-    if (InIdx < Ins.size() &&
-        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
+    const bool IsByRef = Arg.hasByRefAttr();
+    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+
+    if (AllocSize == 0)
       break;
 
-    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
-           Ins[InIdx].getOrigArgIndex() == ArgIdx;
-         InIdx++) {
-      assert(ArgLocs[ArgIdx].isMemLoc());
-      auto &ArgLoc = ArgLocs[InIdx];
-      const Align KernelArgBaseAlign = Align(16);
-      unsigned ArgOffset = ArgLoc.getLocMemOffset();
-      Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
-      unsigned NumAllocSGPRs =
-          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
-
-      // Fix alignment for hidden arguments.
-      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
-        if (!AlignedForImplictArgs) {
-          ImplicitArgOffset =
-              alignTo(LastExplicitArgOffset,
-                      Subtarget->getAlignmentForImplicitArgPtr()) -
-              LastExplicitArgOffset;
-          AlignedForImplictArgs = true;
-        }
-        ArgOffset += ImplicitArgOffset;
-      }
+    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
 
-      // Arg is preloaded into the previous SGPR.
-      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
-        assert(InIdx >= 1 && "No previous SGPR");
-        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
-            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
-        continue;
-      }
+    // Fix alignment for hidden arguments.
+    if (Arg.hasAttribute("amdgpu-hidden-argument") && !AlignedForImplictArgs) {
+      ImplicitArgOffset = alignTo(LastExplicitArgOffset,
+                                  Subtarget->getAlignmentForImplicitArgPtr()) -
+                          LastExplicitArgOffset;
+      AlignedForImplictArgs = true;
+    }
 
-      unsigned Padding = ArgOffset - LastExplicitArgOffset;
-      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
-      // Check for free user SGPRs for preloading.
-      if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
-        InPreloadSequence = false;
-        break;
-      }
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
 
-      // Preload this argument.
-      const TargetRegisterClass *RC =
-          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
-      SmallVectorImpl<MCRegister> *PreloadRegs =
-          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
-
-      if (PreloadRegs->size() > 1)
-        RC = &AMDGPU::SGPR_32RegClass;
-      for (auto &Reg : *PreloadRegs) {
-        assert(Reg);
-        MF.addLiveIn(Reg, RC);
-        CCInfo.AllocateReg(Reg);
-      }
+    if (ArgLocs.empty()) {
+      // GlobalISel: ArgLocs is empty, so allocate once per IR argument.
+      if (Arg.hasAttribute("amdgpu-hidden-argument"))
+        ArgOffset += ImplicitArgOffset;
 
-      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+      if (!allocPreloadKernArg(LastExplicitArgOffset, ArgOffset, AllocSize,
+                               Arg.getArgNo(), MF, TRI, Info, CCInfo))
+        return; // no more available sgprs
+    } else {
+      // SelectionDAG: allocate once per CCValAssign belonging to this arg.
+      for (; LocIdx < ArgLocs.size() &&
+             ArgLocs[LocIdx].getValNo() == Arg.getArgNo();
+           LocIdx++) {
+        CCValAssign &ArgLoc = ArgLocs[LocIdx];
+        assert(ArgLoc.isMemLoc());
+        uint64_t LocOffset = ArgLoc.getLocMemOffset();
+        unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
+        if (Arg.hasAttribute("amdgpu-hidden-argument"))
+          LocOffset += ImplicitArgOffset;
+
+        if (!allocPreloadKernArg(LastExplicitArgOffset, LocOffset, LocSize,
+                                 LocIdx, MF, TRI, Info, CCInfo))
+          return; // no more available sgprs
+      }
     }
   }
 }
@@ -2935,7 +2965,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
     if (IsKernel && Subtarget->hasKernargPreload())
-      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
 
     allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
   } else if (!IsGraphics) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dc0634331caf9..1efab6598dc5b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -576,7 +576,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   void allocatePreloadKernArgSGPRs(CCState &CCInfo,
                                    SmallVectorImpl<CCValAssign> &ArgLocs,
-                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                    MachineFunction &MF,
                                    const SIRegisterInfo &TRI,
                                    SIMachineFunctionInfo &Info) const;

>From a01f97aa4cc67b2241212bd55b10e20d21b8131b Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 10 Apr 2025 16:13:29 +0000
Subject: [PATCH 2/2] update tests

---
 .../amdhsa-kernarg-preload-num-sgprs.ll       |    6 +-
 ...alid-hidden-kernarg-in-kernel-signature.ll |   10 +-
 .../AMDGPU/preload-implicit-kernargs.ll       | 1971 ++++++----
 .../CodeGen/AMDGPU/preload-kernarg-header.ll  |   43 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  | 3296 +++++++++++------
 5 files changed, 3600 insertions(+), 1726 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..17d23c4746f69 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM %s
 
 ; OBJDUMP: Contents of section .rodata:
 ; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000  ................
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll b/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll
index 8344cf9265397..ed5791ec20da7 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll
@@ -1,17 +1,15 @@
-; RUN: not llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
-; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -global-isel=1 -global-isel-abort=2 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
-; RUN: not llc -global-isel=0 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
+; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
+; RUN: not llc -global-isel=1 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
+; RUN: not llc -global-isel=0 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
 
 define amdgpu_kernel void @no_free_sgprs_block_count_x_no_preload_diag(ptr addrspace(1) inreg %out, i512 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
-; GISEL: warning: Instruction selection used fallback path for no_free_sgprs_block_count_x_no_preload_diag
 ; ERROR: error: <unknown>:0:0: in function no_free_sgprs_block_count_x_no_preload_diag void (ptr addrspace(1), i512, i32): hidden argument in kernel signature was not preloaded
   store i32 %_hidden_block_count_x, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @preloadremainder_z_no_preload_diag(ptr addrspace(1) inreg %out, i256 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
-; GISEL: warning: Instruction selection used fallback path for preloadremainder_z_no_preload_diag
 ; ERROR: error: <unknown>:0:0: in function preloadremainder_z_no_preload_diag void (ptr addrspace(1), i256, i32, i32, i32, i16, i16, i16, i16, i16, i16): hidden argument in kernel signature was not preloaded
   %conv = zext i16 %_hidden_remainder_z to i32
   store i32 %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 00507c1eafd6e..537021f20c73d 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -1,35 +1,65 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a,GFX90a-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a,GFX90a-GISEL %s
 
 define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_x:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB0_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB0_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_x:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB0_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB0_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_block_count_x:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB0_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB0_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_x:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB0_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB0_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_x:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB0_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB0_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_x:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB0_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB0_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -37,35 +67,65 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
 }
 
 define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 {
-; GFX942-LABEL: preload_unused_arg_block_count_x:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB1_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB1_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s6
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_unused_arg_block_count_x:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB1_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB1_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_unused_arg_block_count_x:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB1_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB1_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s10
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_unused_arg_block_count_x:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB1_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB1_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_unused_arg_block_count_x:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB1_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB1_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_unused_arg_block_count_x:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB1_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB1_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -73,35 +133,65 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
 }
 
 define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) {
-; GFX942-LABEL: no_free_sgprs_block_count_x:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB2_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB2_0:
-; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x28
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[8:9]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: no_free_sgprs_block_count_x:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB2_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB2_0:
+; GFX942-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: no_free_sgprs_block_count_x:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx4 s[12:15], s[8:9], 0x0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB2_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB2_0:
-; GFX90a-NEXT:    s_load_dword s0, s[8:9], 0x28
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[12:13]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: no_free_sgprs_block_count_x:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[12:15], s[8:9], 0x0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB2_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB2_0:
+; GFX90a-SDAG-NEXT:    s_load_dword s0, s[8:9], 0x28
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[12:13]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: no_free_sgprs_block_count_x:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB2_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB2_0:
+; GFX942-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[8:9]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: no_free_sgprs_block_count_x:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB2_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB2_0:
+; GFX90a-GISEL-NEXT:    s_load_dword s0, s[8:9], 0x28
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[12:13]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -109,25 +199,45 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
 }
 
 define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
-; GFX942-LABEL: no_inreg_block_count_x:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: no_inreg_block_count_x:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: no_inreg_block_count_x:
-; GFX90a:       ; %bb.0:
-; GFX90a-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: no_inreg_block_count_x:
+; GFX90a-SDAG:       ; %bb.0:
+; GFX90a-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: no_inreg_block_count_x:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: no_inreg_block_count_x:
+; GFX90a-GISEL:       ; %bb.0:
+; GFX90a-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -138,25 +248,45 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
 ; args are inreg (preloaded).
 
 define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 {
-; GFX942-LABEL: mixed_inreg_block_count_x:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x10
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: mixed_inreg_block_count_x:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: mixed_inreg_block_count_x:
-; GFX90a:       ; %bb.0:
-; GFX90a-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: mixed_inreg_block_count_x:
+; GFX90a-SDAG:       ; %bb.0:
+; GFX90a-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: mixed_inreg_block_count_x:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: mixed_inreg_block_count_x:
+; GFX90a-GISEL:       ; %bb.0:
+; GFX90a-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -200,35 +330,65 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
 }
 
 define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: incorrect_type_i16_block_count_x:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB6_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB6_0:
-; GFX942-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: incorrect_type_i16_block_count_x:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB6_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB6_0:
+; GFX942-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: incorrect_type_i16_block_count_x:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB6_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB6_0:
-; GFX90a-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: incorrect_type_i16_block_count_x:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB6_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB6_0:
+; GFX90a-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: incorrect_type_i16_block_count_x:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB6_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB6_0:
+; GFX942-GISEL-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: incorrect_type_i16_block_count_x:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB6_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB6_0:
+; GFX90a-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i16, ptr addrspace(4) %imp_arg_ptr
   store i16 %load, ptr addrspace(1) %out
@@ -236,33 +396,61 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
 }
 
 define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_y:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB7_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB7_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_y:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB7_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB7_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_block_count_y:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB7_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB7_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_y:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB7_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB7_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_y:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB7_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB7_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_y:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB7_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB7_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
   %load = load i32, ptr addrspace(4) %gep
@@ -271,37 +459,67 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
 }
 
 define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: random_incorrect_offset:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB8_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB8_0:
-; GFX942-NEXT:    s_mov_b32 s4, 8
-; GFX942-NEXT:    s_load_dword s0, s[0:1], s4 offset:0x2
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: random_incorrect_offset:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB8_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB8_0:
+; GFX942-SDAG-NEXT:    s_mov_b32 s4, 8
+; GFX942-SDAG-NEXT:    s_load_dword s0, s[0:1], s4 offset:0x2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: random_incorrect_offset:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB8_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB8_0:
-; GFX90a-NEXT:    s_mov_b32 s0, 8
-; GFX90a-NEXT:    s_load_dword s0, s[4:5], s0 offset:0x2
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: random_incorrect_offset:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB8_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB8_0:
+; GFX90a-SDAG-NEXT:    s_mov_b32 s0, 8
+; GFX90a-SDAG-NEXT:    s_load_dword s0, s[4:5], s0 offset:0x2
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: random_incorrect_offset:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB8_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB8_0:
+; GFX942-GISEL-NEXT:    s_load_dword s0, s[0:1], 0xa
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: random_incorrect_offset:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB8_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB8_0:
+; GFX90a-GISEL-NEXT:    s_load_dword s0, s[4:5], 0xa
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
   %load = load i32, ptr addrspace(4) %gep
@@ -310,35 +528,65 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
 }
 
 define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_z:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB9_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB9_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s6
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_z:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB9_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB9_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_block_count_z:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB9_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB9_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s10
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_z:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB9_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB9_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_z:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB9_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB9_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_z:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB9_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB9_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
   %load = load i32, ptr addrspace(4) %gep
@@ -347,39 +595,73 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
 }
 
 define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 {
-; GFX942-LABEL: preload_block_count_x_imparg_align_ptr_i8:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB10_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB10_0:
-; GFX942-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX942-NEXT:    s_add_i32 s0, s6, s0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB10_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB10_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT:    s_add_i32 s0, s6, s0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB10_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB10_0:
-; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT:    s_add_i32 s0, s10, s0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB10_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB10_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT:    s_add_i32 s0, s10, s0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB10_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB10_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT:    s_add_i32 s0, s6, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB10_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB10_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT:    s_add_i32 s0, s10, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   %ext = zext i8 %val to i32
@@ -389,39 +671,73 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
 }
 
 define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_xyz:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB11_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB11_0:
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    v_mov_b32_e32 v2, s6
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_xyz:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB11_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB11_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_block_count_xyz:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB11_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB11_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_xyz:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB11_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB11_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_xyz:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB11_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB11_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_xyz:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB11_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB11_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
   %load_x = load i32, ptr addrspace(4) %gep_x
@@ -437,35 +753,65 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
 }
 
 define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_x:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB12_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB12_0:
-; GFX942-NEXT:    s_and_b32 s0, s7, 0xffff
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_x:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB12_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB12_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s7, 0xffff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_workgroup_size_x:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB12_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB12_0:
-; GFX90a-NEXT:    s_and_b32 s0, s11, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_x:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB12_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB12_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s11, 0xffff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_x:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB12_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB12_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s7, 0xffff
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_x:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB12_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB12_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s11, 0xffff
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
   %load = load i16, ptr addrspace(4) %gep
@@ -475,35 +821,65 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
 }
 
 define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_y:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB13_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB13_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_y:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB13_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB13_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_workgroup_size_y:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB13_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB13_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s11, 16
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_y:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB13_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB13_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_y:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB13_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB13_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s7, s7, 16
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_y:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB13_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB13_0:
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v0, s11, s11, 16
+; GFX90a-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
   %load = load i16, ptr addrspace(4) %gep
@@ -513,37 +889,69 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
 }
 
 define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_z:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s8, s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB14_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB14_0:
-; GFX942-NEXT:    s_and_b32 s0, s8, 0xffff
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_z:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s8, s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB14_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB14_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_workgroup_size_z:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s12, s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB14_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB14_0:
-; GFX90a-NEXT:    s_and_b32 s0, s12, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_z:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB14_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB14_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s12, 0xffff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_z:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s8, s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB14_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB14_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_z:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB14_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB14_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s12, 0xffff
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
   %load = load i16, ptr addrspace(4) %gep
@@ -553,45 +961,85 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
 }
 
 define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_xyz:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s8, s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB15_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB15_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
-; GFX942-NEXT:    s_and_b32 s1, s7, 0xffff
-; GFX942-NEXT:    s_and_b32 s4, s8, 0xffff
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s1
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    v_mov_b32_e32 v2, s4
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_xyz:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s8, s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB15_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB15_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX942-SDAG-NEXT:    s_and_b32 s1, s7, 0xffff
+; GFX942-SDAG-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_workgroup_size_xyz:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s12, s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB15_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB15_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s11, 16
-; GFX90a-NEXT:    s_and_b32 s1, s11, 0xffff
-; GFX90a-NEXT:    s_and_b32 s2, s12, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_xyz:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB15_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB15_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-SDAG-NEXT:    s_and_b32 s1, s11, 0xffff
+; GFX90a-SDAG-NEXT:    s_and_b32 s2, s12, 0xffff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_xyz:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s8, s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB15_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB15_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s7, s7, 16
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s7, 0xffff
+; GFX942-GISEL-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_xyz:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB15_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB15_0:
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v0, s11, s11, 16
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s11, 0xffff
+; GFX90a-GISEL-NEXT:    s_and_b32 s1, s12, 0xffff
+; GFX90a-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
   %load_x = load i16, ptr addrspace(4) %gep_x
@@ -610,37 +1058,69 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
 }
 
 define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_remainder_x:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s8, s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB16_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB16_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_remainder_x:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s8, s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB16_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB16_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_remainder_x:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s12, s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB16_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB16_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s12, 16
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_remainder_x:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB16_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB16_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_remainder_x:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s8, s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB16_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB16_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s8, s8, 16
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_remainder_x:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB16_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB16_0:
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v0, s12, s12, 16
+; GFX90a-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
   %load = load i16, ptr addrspace(4) %gep
@@ -650,37 +1130,69 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
 }
 
 define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preloadremainder_y:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB17_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB17_0:
-; GFX942-NEXT:    s_and_b32 s0, s9, 0xffff
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preloadremainder_y:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB17_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB17_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preloadremainder_y:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB17_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB17_0:
-; GFX90a-NEXT:    s_and_b32 s0, s13, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preloadremainder_y:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB17_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB17_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s13, 0xffff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preloadremainder_y:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB17_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB17_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preloadremainder_y:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB17_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB17_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s13, 0xffff
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
   %load = load i16, ptr addrspace(4) %gep
@@ -690,37 +1202,69 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
 }
 
 define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preloadremainder_z:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB18_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB18_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s9, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preloadremainder_z:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB18_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB18_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preloadremainder_z:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB18_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB18_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preloadremainder_z:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB18_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB18_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preloadremainder_z:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB18_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB18_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s9, s9, 16
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preloadremainder_z:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB18_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB18_0:
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v0, s13, s13, 16
+; GFX90a-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
   %load = load i16, ptr addrspace(4) %gep
@@ -730,45 +1274,85 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
 }
 
 define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preloadremainder_xyz:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB19_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB19_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s9, 16
-; GFX942-NEXT:    s_lshr_b32 s1, s8, 16
-; GFX942-NEXT:    s_and_b32 s4, s9, 0xffff
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s1
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    v_mov_b32_e32 v2, s0
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preloadremainder_xyz:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB19_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB19_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX942-SDAG-NEXT:    s_lshr_b32 s1, s8, 16
+; GFX942-SDAG-NEXT:    s_and_b32 s4, s9, 0xffff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preloadremainder_xyz:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB19_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB19_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT:    s_lshr_b32 s1, s12, 16
-; GFX90a-NEXT:    s_and_b32 s2, s13, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preloadremainder_xyz:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB19_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB19_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s1, s12, 16
+; GFX90a-SDAG-NEXT:    s_and_b32 s2, s13, 0xffff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preloadremainder_xyz:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB19_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB19_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s8, s8, 16
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v1, s9, s9, 16
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preloadremainder_xyz:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB19_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB19_0:
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v0, s12, s12, 16
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v1, s13, s13, 16
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s13, 0xffff
+; GFX90a-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
   %load_x = load i16, ptr addrspace(4) %gep_x
@@ -787,35 +1371,65 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
 }
 
 define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) {
-; GFX942-LABEL: no_free_sgprs_preloadremainder_z:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB20_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB20_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s15, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[8:9]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB20_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB20_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s15, 16
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: no_free_sgprs_preloadremainder_z:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB20_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB20_0:
-; GFX90a-NEXT:    s_load_dword s0, s[8:9], 0x1c
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[12:13]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB20_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB20_0:
+; GFX90a-SDAG-NEXT:    s_load_dword s0, s[8:9], 0x1c
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[12:13]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB20_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB20_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s15, s15, 16
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[8:9]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB20_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB20_0:
+; GFX90a-GISEL-NEXT:    s_load_dword s0, s[8:9], 0x1c
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[12:13]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
   %load = load i16, ptr addrspace(4) %gep
@@ -827,38 +1441,71 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
 ; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs.
 
 define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 {
-; GFX942-LABEL: preload_block_max_user_sgprs:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s12, s[0:1], 0x28
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB21_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB21_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s12
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_block_max_user_sgprs:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s12, s[0:1], 0x28
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB21_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB21_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s12
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_block_max_user_sgprs:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_load_dword s14, s[4:5], 0x20
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB21_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB21_0:
-; GFX90a-NEXT:    s_load_dword s0, s[4:5], 0x28
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_max_user_sgprs:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_load_dword s14, s[4:5], 0x20
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB21_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB21_0:
+; GFX90a-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_max_user_sgprs:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s12, s[0:1], 0x28
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB21_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB21_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_max_user_sgprs:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x20
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB21_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB21_0:
+; GFX90a-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %load = load i32, ptr addrspace(4) %imp_arg_ptr
   store i32 %load, ptr addrspace(1) %out
@@ -866,43 +1513,81 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
 }
 
 define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB22_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB22_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s9, 16
-; GFX942-NEXT:    s_and_b32 s1, s8, 0xffff
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s1
-; GFX942-NEXT:    v_mov_b32_e32 v2, s0
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB22_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB22_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX942-SDAG-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB22_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB22_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT:    s_and_b32 s1, s12, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB22_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB22_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT:    s_and_b32 s1, s12, 0xffff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB22_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB22_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s9, s9, 16
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB22_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB22_0:
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v0, s13, s13, 16
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s12, 0xffff
+; GFX90a-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
   %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 58f0b9657476c..1251d358e501f 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,7 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM,ASM-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM,ASM-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
 
 ; OBJ: preload_ptr_kernarg_header
 ; OBJ-COUNT-60: s_nop 0
@@ -22,17 +25,29 @@ define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) {
 ; OBJ: preload_i32_kernarg_header
 ; OBJ-COUNT-58: s_nop 0
 define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg %arg1) {
-; ASM-LABEL: preload_i32_kernarg_header:
-; ASM:         s_load_dwordx2 s[8:9], s[4:5], 0x0
-; ASM-NEXT:    s_load_dword s10, s[4:5], 0x8
-; ASM-NEXT:    s_waitcnt lgkmcnt(0)
-; ASM-NEXT:    s_branch .LBB1_0
-; ASM-NEXT:    .p2align 8
-; ASM-NEXT:  .LBB1_0:
-; ASM-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; ASM-NEXT:    v_mov_b32_e32 v2, s10
-; ASM-NEXT:    flat_store_dword v[0:1], v2
-; ASM-NEXT:    s_endpgm
+; ASM-SDAG-LABEL: preload_i32_kernarg_header:
+; ASM-SDAG:         s_load_dwordx2 s[8:9], s[4:5], 0x0
+; ASM-SDAG-NEXT:    s_load_dword s10, s[4:5], 0x8
+; ASM-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; ASM-SDAG-NEXT:    s_branch .LBB1_0
+; ASM-SDAG-NEXT:    .p2align 8
+; ASM-SDAG-NEXT:  .LBB1_0:
+; ASM-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; ASM-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; ASM-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; ASM-SDAG-NEXT:    s_endpgm
+;
+; ASM-GISEL-LABEL: preload_i32_kernarg_header:
+; ASM-GISEL:         s_load_dwordx2 s[8:9], s[4:5], 0x0
+; ASM-GISEL-NEXT:    s_load_dword s10, s[4:5], 0x8
+; ASM-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; ASM-GISEL-NEXT:    s_branch .LBB1_0
+; ASM-GISEL-NEXT:    .p2align 8
+; ASM-GISEL-NEXT:  .LBB1_0:
+; ASM-GISEL-NEXT:    v_mov_b32_e32 v2, s10
+; ASM-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; ASM-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; ASM-GISEL-NEXT:    s_endpgm
     store i32 %arg1, ptr %arg
     ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index fe6378435a42e..6cf7ebe6ded12 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,217 +1,400 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942 %s
-
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a,GFX90a-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a,GFX90a-GISEL %s
 
 define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i8:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB0_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB0_0:
-; GFX942-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i8:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB0_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB0_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: ptr1_i8:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB0_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB0_0:
-; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i8:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB0_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB0_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i8:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB0_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB0_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i8:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB0_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB0_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zeroext inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i8_zext_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB1_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB1_0:
-; GFX942-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i8_zext_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB1_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB1_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: ptr1_i8_zext_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB1_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB1_0:
-; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i8_zext_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB1_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB1_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i8_zext_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB1_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB1_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i8_zext_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB1_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB1_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i16_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB2_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB2_0:
-; GFX942-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i16_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB2_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB2_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: ptr1_i16_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB2_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB2_0:
-; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i16_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB2_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB2_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i16_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB2_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB2_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i16_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB2_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB2_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %ext = zext i16 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i32_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB3_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB3_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i32_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB3_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB3_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: ptr1_i32_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB3_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB3_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i32_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB3_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB3_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i32_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB3_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB3_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i32_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB3_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB3_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store i32 %arg0, ptr addrspace(1) %out
   ret void
 }
 
 
 define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspace(1) inreg %out, i32 inreg %arg1) #0 {
-; GFX942-LABEL: i32_ptr1_i32_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB4_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB4_0:
-; GFX942-NEXT:    s_add_i32 s0, s2, s6
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i32_ptr1_i32_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB4_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB4_0:
+; GFX942-SDAG-NEXT:    s_add_i32 s0, s2, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB4_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB4_0:
-; GFX90a-NEXT:    s_add_i32 s0, s6, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i32_ptr1_i32_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB4_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB4_0:
+; GFX942-GISEL-NEXT:    s_add_i32 s0, s2, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB4_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB4_0:
+; GFX90a-SDAG-NEXT:    s_add_i32 s0, s6, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB4_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB4_0:
+; GFX90a-GISEL-NEXT:    s_add_i32 s0, s6, s10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[8:9]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %add = add i32 %arg0, %arg1
   store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #0 {
-; GFX942-LABEL: ptr1_i16_i16_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB5_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB5_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX942-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX942-NEXT:    s_add_i32 s0, s1, s0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i16_i16_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB5_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB5_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX942-SDAG-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX942-SDAG-NEXT:    s_add_i32 s0, s1, s0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB5_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB5_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT:    s_and_b32 s1, s8, 0xffff
-; GFX90a-NEXT:    s_add_i32 s0, s1, s0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i16_i16_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB5_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB5_0:
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v0, s4, s4, 16
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX942-GISEL-NEXT:    v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB5_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB5_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-SDAG-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX90a-SDAG-NEXT:    s_add_i32 s0, s1, s0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB5_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB5_0:
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v0, s8, s8, 16
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-GISEL-NEXT:    v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %ext = zext i16 %arg0 to i32
   %ext1 = zext i16 %arg1 to i32
   %add = add i32 %ext, %ext1
@@ -220,33 +403,61 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
 }
 
 define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 x i8> inreg %in) #0 {
-; GFX942-LABEL: ptr1_v2i8_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB6_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB6_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: ptr1_v2i8_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB6_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB6_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB6_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB6_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: ptr1_v2i8_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB6_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB6_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB6_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB6_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB6_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB6_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <2 x i8> %in, ptr addrspace(1) %out
   ret void
 }
@@ -344,943 +555,1842 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
 
 
 define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x i32> inreg %in) #0 {
-; GFX942-LABEL: v8i32_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB9_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB9_0:
-; GFX942-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    v_mov_b32_e32 v1, s9
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mov_b32_e32 v0, s4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    v_mov_b32_e32 v2, s6
-; GFX942-NEXT:    v_mov_b32_e32 v3, s7
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v8i32_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB9_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB9_0:
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s7
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v8i32_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB9_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB9_0:
-; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT:    s_nop 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v8i32_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB9_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB9_0:
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v8i32_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB9_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB9_0:
+; GFX90a-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-SDAG-NEXT:    s_nop 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v8i32_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB9_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB9_0:
+; GFX90a-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[6:7], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX90a-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <8 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i16> inreg %in) #0 {
-; GFX942-LABEL: v3i16_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB10_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB10_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3] offset:4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v3i16_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB10_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB10_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v3i16_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB10_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB10_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v3i16_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB10_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB10_0:
+; GFX942-GISEL-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3i16_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB10_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB10_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3i16_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB10_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB10_0:
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <3 x i16> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i32> inreg %in) #0 {
-; GFX942-LABEL: v3i32_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB11_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB11_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v3i32_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB11_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB11_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v3i32_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB11_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB11_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v3i32_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB11_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB11_0:
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT:    s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3i32_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB11_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB11_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3i32_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB11_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB11_0:
+; GFX90a-GISEL-NEXT:    s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT:    s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT:    s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <3 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x float> inreg %in) #0 {
-; GFX942-LABEL: v3f32_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB12_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB12_0:
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v3f32_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB12_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB12_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v3f32_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB12_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB12_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v3f32_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB12_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB12_0:
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT:    s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3f32_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB12_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB12_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3f32_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB12_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB12_0:
+; GFX90a-GISEL-NEXT:    s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT:    s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT:    s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <3 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %out, <5 x i8> inreg %in) #0 {
-; GFX942-LABEL: v5i8_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB13_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB13_0:
-; GFX942-NEXT:    s_lshr_b32 s1, s4, 24
-; GFX942-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX942-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX942-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX942-NEXT:    s_or_b32 s1, s4, s1
-; GFX942-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX942-NEXT:    s_or_b32 s0, s0, s1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    global_store_byte v0, v1, s[2:3] offset:4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v5i8_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB13_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB13_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX942-SDAG-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX942-SDAG-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX942-SDAG-NEXT:    s_or_b32 s1, s4, s1
+; GFX942-SDAG-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX942-SDAG-NEXT:    s_or_b32 s0, s0, s1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    global_store_byte v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v5i8_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB13_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB13_0:
-; GFX90a-NEXT:    s_lshr_b32 s1, s8, 24
-; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT:    s_bfe_u32 s2, s8, 0x80010
-; GFX90a-NEXT:    s_or_b32 s1, s2, s1
-; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
-; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX90a-NEXT:    s_or_b32 s0, s0, s1
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v5i8_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB13_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB13_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s1, 0xffff, s4
+; GFX942-GISEL-NEXT:    s_lshr_b32 s1, s1, 8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v5i8_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB13_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB13_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s1, s8, 24
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX90a-SDAG-NEXT:    s_bfe_u32 s2, s8, 0x80010
+; GFX90a-SDAG-NEXT:    s_or_b32 s1, s2, s1
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX90a-SDAG-NEXT:    s_or_b32 s0, s0, s1
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v5i8_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB13_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB13_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s1, 0xffff, s8
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s1, s1, 8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:3
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <5 x i8> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x double> inreg %in) #0 {
-; GFX942-LABEL: v5f64_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB14_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB14_0:
-; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX942-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s8
-; GFX942-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX942-NEXT:    v_mov_b32_e32 v1, s9
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mov_b32_e32 v0, s4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    v_mov_b32_e32 v2, s6
-; GFX942-NEXT:    v_mov_b32_e32 v3, s7
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v5f64_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB14_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB14_0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s7
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v5f64_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB14_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB14_0:
-; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT:    s_nop 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v5f64_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB14_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB14_0:
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x60
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v8, v[0:1], s[2:3] offset:32
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v5f64_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB14_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB14_0:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-SDAG-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-SDAG-NEXT:    s_nop 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v5f64_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB14_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB14_0:
+; GFX90a-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[6:7], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX90a-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    global_store_dwordx2 v8, v[0:1], s[6:7] offset:32
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <5 x double> %in, ptr addrspace(1) %out, align 8
   ret void
 }
 
 define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8> inreg %in) #0 {
-; GFX942-LABEL: v8i8_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB15_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB15_0:
-; GFX942-NEXT:    s_lshr_b32 s1, s5, 24
-; GFX942-NEXT:    s_and_b32 s0, s5, 0xffff
-; GFX942-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX942-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX942-NEXT:    s_or_b32 s1, s5, s1
-; GFX942-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX942-NEXT:    s_lshr_b32 s5, s4, 24
-; GFX942-NEXT:    s_or_b32 s0, s0, s1
-; GFX942-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX942-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX942-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX942-NEXT:    s_or_b32 s4, s4, s5
-; GFX942-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX942-NEXT:    s_or_b32 s1, s1, s4
-; GFX942-NEXT:    v_mov_b32_e32 v0, s1
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v8i8_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB15_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB15_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s1, s5, 24
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s5, 0xffff
+; GFX942-SDAG-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX942-SDAG-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX942-SDAG-NEXT:    s_or_b32 s1, s5, s1
+; GFX942-SDAG-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX942-SDAG-NEXT:    s_lshr_b32 s5, s4, 24
+; GFX942-SDAG-NEXT:    s_or_b32 s0, s0, s1
+; GFX942-SDAG-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX942-SDAG-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX942-SDAG-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX942-SDAG-NEXT:    s_or_b32 s4, s4, s5
+; GFX942-SDAG-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX942-SDAG-NEXT:    s_or_b32 s1, s1, s4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v8i8_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB15_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB15_0:
-; GFX90a-NEXT:    s_lshr_b32 s1, s9, 24
-; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT:    s_bfe_u32 s2, s9, 0x80010
-; GFX90a-NEXT:    s_or_b32 s1, s2, s1
-; GFX90a-NEXT:    s_lshr_b32 s2, s8, 24
-; GFX90a-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX90a-NEXT:    s_bfe_u32 s3, s8, 0x80010
-; GFX90a-NEXT:    s_and_b32 s0, s9, 0xffff
-; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX90a-NEXT:    s_or_b32 s2, s3, s2
-; GFX90a-NEXT:    s_or_b32 s0, s0, s1
-; GFX90a-NEXT:    s_and_b32 s1, s8, 0xffff
-; GFX90a-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX90a-NEXT:    s_or_b32 s1, s1, s2
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v8i8_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB15_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB15_0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v8i8_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB15_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB15_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s1, s9, 24
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX90a-SDAG-NEXT:    s_bfe_u32 s2, s9, 0x80010
+; GFX90a-SDAG-NEXT:    s_or_b32 s1, s2, s1
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s2, s8, 24
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX90a-SDAG-NEXT:    s_bfe_u32 s3, s8, 0x80010
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX90a-SDAG-NEXT:    s_or_b32 s2, s3, s2
+; GFX90a-SDAG-NEXT:    s_or_b32 s0, s0, s1
+; GFX90a-SDAG-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX90a-SDAG-NEXT:    s_or_b32 s1, s1, s2
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v8i8_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB15_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB15_0:
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <8 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i64 inreg %a) #0 {
-; GFX942-LABEL: i64_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB16_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB16_0:
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i64_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB16_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB16_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i64_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB16_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB16_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i64_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB16_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB16_0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i64_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB16_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB16_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-SDAG-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i64_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB16_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB16_0:
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store i64 %a, ptr addrspace(1) %out, align 8
   ret void
 }
 
 define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, double inreg %in) #0 {
-; GFX942-LABEL: f64_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB17_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB17_0:
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: f64_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB17_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB17_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: f64_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB17_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB17_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: f64_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB17_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB17_0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: f64_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB17_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB17_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-SDAG-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: f64_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB17_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB17_0:
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store double %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in) #0 {
-; GFX942-LABEL: half_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB18_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB18_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: half_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB18_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB18_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: half_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB18_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB18_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: half_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB18_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB18_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: half_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB18_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB18_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: half_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB18_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB18_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store half %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, bfloat inreg %in) #0 {
-; GFX942-LABEL: bfloat_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB19_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB19_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: bfloat_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB19_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB19_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: bfloat_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB19_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB19_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: bfloat_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB19_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB19_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB19_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB19_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB19_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB19_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store bfloat %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <2 x bfloat> inreg %in) #0 {
-; GFX942-LABEL: v2bfloat_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB20_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB20_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v2bfloat_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB20_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB20_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB20_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB20_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v2bfloat_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB20_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB20_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB20_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB20_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB20_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB20_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <2 x bfloat> %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <3 x bfloat> inreg %in) #0 {
-; GFX942-LABEL: v3bfloat_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB21_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB21_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3] offset:4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v3bfloat_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB21_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB21_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB21_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB21_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v3bfloat_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB21_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB21_0:
+; GFX942-GISEL-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB21_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB21_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB21_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB21_0:
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <3 x bfloat> %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <6 x bfloat> inreg %in) #0 {
-; GFX942-LABEL: v6bfloat_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB22_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB22_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v6bfloat_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB22_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB22_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB22_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB22_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v6bfloat_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB22_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB22_0:
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT:    s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB22_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB22_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB22_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB22_0:
+; GFX90a-GISEL-NEXT:    s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT:    s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT:    s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <6 x bfloat> %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in, <7 x bfloat> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: half_v7bfloat_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB23_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB23_0:
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s4
-; GFX942-NEXT:    global_store_short v3, v0, s[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v0, s9
-; GFX942-NEXT:    global_store_short v3, v0, s[10:11] offset:12
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB23_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB23_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    global_store_short v3, v0, s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s9
+; GFX942-SDAG-NEXT:    global_store_short v3, v0, s[10:11] offset:12
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB23_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB23_0:
-; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-NEXT:    global_store_short v3, v0, s[6:7]
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s13
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    global_store_short v3, v0, s[0:1] offset:12
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB23_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB23_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_lshr_b32 s0, s6, 16
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[10:11] offset:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[10:11] offset:4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT:    s_lshr_b32 s5, s8, 16
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[10:11] offset:6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[10:11] offset:8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[10:11] offset:10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[10:11] offset:12
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB23_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB23_0:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB23_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB23_0:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s2, s10, 16
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s3, s11, 16
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[0:1] offset:2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[0:1] offset:4
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s9, s12, 16
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[0:1] offset:6
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[0:1] offset:8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[0:1] offset:10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[0:1] offset:12
+; GFX90a-GISEL-NEXT:    s_endpgm
   store half %in, ptr addrspace(1) %out
   store <7 x bfloat> %in2, ptr addrspace(1) %out2
   ret void
 }
 
 define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 inreg %in) #0 {
-; GFX942-LABEL: i1_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB24_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB24_0:
-; GFX942-NEXT:    s_and_b32 s0, s4, 1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_byte v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i1_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB24_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB24_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i1_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB24_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB24_0:
-; GFX90a-NEXT:    s_and_b32 s0, s8, 1
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_byte v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i1_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB24_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB24_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s4, 1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i1_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB24_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB24_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i1_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB24_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB24_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store i1 %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, fp128 inreg %in) #0 {
-; GFX942-LABEL: fp128_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB25_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB25_0:
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    v_mov_b32_e32 v3, s9
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: fp128_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB25_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB25_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s9
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: fp128_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB25_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB25_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-NEXT:    v_mov_b32_e32 v3, s13
-; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: fp128_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB25_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB25_0:
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT:    s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT:    s_mov_b32 s7, s9
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: fp128_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB25_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB25_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: fp128_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB25_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB25_0:
+; GFX90a-GISEL-NEXT:    s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT:    s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT:    s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT:    s_mov_b32 s3, s13
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store fp128 %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x i8> inreg %in) #0 {
-; GFX942-LABEL: v7i8_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB26_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB26_0:
-; GFX942-NEXT:    s_lshr_b32 s1, s4, 24
-; GFX942-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX942-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX942-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX942-NEXT:    s_or_b32 s1, s4, s1
-; GFX942-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX942-NEXT:    s_or_b32 s0, s0, s1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    global_store_byte_d16_hi v0, v1, s[2:3] offset:6
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3] offset:4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v7i8_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB26_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB26_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX942-SDAG-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX942-SDAG-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX942-SDAG-NEXT:    s_or_b32 s1, s4, s1
+; GFX942-SDAG-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX942-SDAG-NEXT:    s_or_b32 s0, s0, s1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    global_store_byte_d16_hi v0, v1, s[2:3] offset:6
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v7i8_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB26_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB26_0:
-; GFX90a-NEXT:    s_lshr_b32 s1, s8, 24
-; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT:    s_bfe_u32 s2, s8, 0x80010
-; GFX90a-NEXT:    s_or_b32 s1, s2, s1
-; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
-; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX90a-NEXT:    s_or_b32 s0, s0, s1
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    global_store_byte_d16_hi v0, v1, s[6:7] offset:6
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v7i8_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB26_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB26_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s6, 0xffff, s4
+; GFX942-GISEL-NEXT:    s_lshr_b32 s6, s6, 8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    s_lshr_b32 s7, s0, 8
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    s_and_b32 s8, 0xffff, s5
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX942-GISEL-NEXT:    s_lshr_b32 s8, s8, 8
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT:    s_lshr_b32 s1, s5, 16
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT:    global_store_byte v1, v0, s[2:3] offset:6
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB26_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB26_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s1, s8, 24
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX90a-SDAG-NEXT:    s_bfe_u32 s2, s8, 0x80010
+; GFX90a-SDAG-NEXT:    s_or_b32 s1, s2, s1
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX90a-SDAG-NEXT:    s_or_b32 s0, s0, s1
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    global_store_byte_d16_hi v0, v1, s[6:7] offset:6
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB26_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB26_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s2, 0xffff, s8
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    s_and_b32 s4, 0xffff, s9
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s4, s4, 8
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:3
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s1, s9, 16
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:5
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-GISEL-NEXT:    global_store_byte v1, v0, s[6:7] offset:6
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <7 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x half> inreg %in) #0 {
-; GFX942-LABEL: v7half_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB27_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB27_0:
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s9
-; GFX942-NEXT:    global_store_short v3, v0, s[2:3] offset:12
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: v7half_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB27_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB27_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s9
+; GFX942-SDAG-NEXT:    global_store_short v3, v0, s[2:3] offset:12
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: v7half_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB27_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB27_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s13
-; GFX90a-NEXT:    global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: v7half_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB27_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB27_0:
+; GFX942-GISEL-NEXT:    s_lshr_b32 s0, s6, 16
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT:    s_lshr_b32 s4, s8, 16
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3] offset:12
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: v7half_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB27_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB27_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-SDAG-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: v7half_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB27_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB27_0:
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s0, s10, 16
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s1, s11, 16
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-GISEL-NEXT:    s_lshr_b32 s2, s12, 16
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:6
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7] offset:12
+; GFX90a-GISEL-NEXT:    s_endpgm
   store <7 x half> %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_i32_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB28_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB28_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v1, s5
-; GFX942-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i16_i32_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB28_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB28_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i16_i32_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB28_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB28_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-NEXT:    global_store_dword v0, v1, s[10:11]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i16_i32_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB28_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB28_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB28_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB28_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB28_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB28_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[10:11]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store i16 %in, ptr addrspace(1) %out
   store i32 %in2, ptr addrspace(1) %out2
   ret void
 }
 
 define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <3 x i32> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_v3i32_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB29_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB29_0:
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, s4
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    global_store_short v3, v4, s[2:3]
-; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB29_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB29_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT:    global_store_short v3, v4, s[2:3]
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB29_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB29_0:
-; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
-; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v4, s8
-; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-NEXT:    global_store_short v3, v4, s[6:7]
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB29_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB29_0:
+; GFX942-GISEL-NEXT:    s_mov_b32 s12, s6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    s_mov_b32 s13, s7
+; GFX942-GISEL-NEXT:    s_mov_b32 s14, s8
+; GFX942-GISEL-NEXT:    global_store_short v3, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s12
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s13
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s14
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB29_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB29_0:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB29_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB29_0:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-GISEL-NEXT:    s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT:    s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT:    s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store i16 %in, ptr addrspace(1) %out
   store <3 x i32> %in2, ptr addrspace(1) %out2
   ret void
 }
 
 define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i16 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_i16_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB30_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB30_0:
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
-; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[6:7]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i16_i16_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB30_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB30_0:
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    global_store_short_d16_hi v0, v1, s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i16_i16_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB30_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB30_0:
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT:    global_store_short_d16_hi v0, v1, s[10:11]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i16_i16_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB30_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB30_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v1, s4, s4, 16
+; GFX942-GISEL-NEXT:    global_store_short v2, v0, s[2:3]
+; GFX942-GISEL-NEXT:    global_store_short v2, v1, s[6:7]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB30_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB30_0:
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB30_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB30_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v1, s8, s8, 16
+; GFX90a-GISEL-NEXT:    global_store_short v2, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    global_store_short v2, v1, s[10:11]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store i16 %in, ptr addrspace(1) %out
   store i16 %in2, ptr addrspace(1) %out2
   ret void
 }
 
 define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <2 x i8> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_v2i8_kernel_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB31_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB31_0:
-; GFX942-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX942-NEXT:    s_lshl_b32 s0, s0, 8
-; GFX942-NEXT:    s_bfe_u32 s1, s4, 0x80010
-; GFX942-NEXT:    s_or_b32 s0, s1, s0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB31_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB31_0:
+; GFX942-SDAG-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX942-SDAG-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX942-SDAG-NEXT:    s_bfe_u32 s1, s4, 0x80010
+; GFX942-SDAG-NEXT:    s_or_b32 s0, s1, s0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB31_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB31_0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-NEXT:    s_lshl_b32 s0, s0, 8
-; GFX90a-NEXT:    s_bfe_u32 s1, s8, 0x80010
-; GFX90a-NEXT:    s_or_b32 s0, s1, s0
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_short v0, v1, s[10:11]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB31_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB31_0:
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    v_alignbit_b32 v1, s4, s4, 16
+; GFX942-GISEL-NEXT:    global_store_short v2, v0, s[2:3]
+; GFX942-GISEL-NEXT:    global_store_short v2, v1, s[6:7]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB31_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB31_0:
+; GFX90a-SDAG-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-SDAG-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX90a-SDAG-NEXT:    s_bfe_u32 s1, s8, 0x80010
+; GFX90a-SDAG-NEXT:    s_or_b32 s0, s1, s0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_short v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB31_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB31_0:
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT:    v_alignbit_b32 v1, s8, s8, 16
+; GFX90a-GISEL-NEXT:    global_store_short v2, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    global_store_short v2, v1, s[10:11]
+; GFX90a-GISEL-NEXT:    s_endpgm
   store i16 %in, ptr addrspace(1) %out
   store <2 x i8> %in2, ptr addrspace(1) %out2
   ret void
@@ -1289,74 +2399,138 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
 ; The second argument is not expected to be preloaded with the current behavior.
 
 define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, ptr addrspace(1) %out, i32 inreg %arg1) #0 {
-; GFX942-LABEL: i32_ptr1_i32_staggered_preload_arg:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB32_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB32_0:
-; GFX942-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_add_i32 s0, s2, s3
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB32_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB32_0:
+; GFX942-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_add_i32 s0, s2, s3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB32_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB32_0:
-; GFX90a-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_add_i32 s2, s6, s2
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB32_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB32_0:
+; GFX942-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_add_i32 s0, s2, s3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB32_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB32_0:
+; GFX90a-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_add_i32 s2, s6, s2
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB32_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB32_0:
+; GFX90a-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_add_i32 s2, s6, s2
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %add = add i32 %arg0, %arg1
   store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, i8 inreg %arg0, i32 inreg %unused) #0 {
-; GFX942-LABEL: ptr1_i8_trailing_unused:
-; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_branch .LBB33_0
-; GFX942-NEXT:    .p2align 8
-; GFX942-NEXT:  ; %bb.2:
-; GFX942-NEXT:  .LBB33_0:
-; GFX942-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i8_trailing_unused:
+; GFX942-SDAG:       ; %bb.1:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_branch .LBB33_0
+; GFX942-SDAG-NEXT:    .p2align 8
+; GFX942-SDAG-NEXT:  ; %bb.2:
+; GFX942-SDAG-NEXT:  .LBB33_0:
+; GFX942-SDAG-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
-; GFX90a-LABEL: ptr1_i8_trailing_unused:
-; GFX90a:       ; %bb.1:
-; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT:    s_branch .LBB33_0
-; GFX90a-NEXT:    .p2align 8
-; GFX90a-NEXT:  ; %bb.2:
-; GFX90a-NEXT:  .LBB33_0:
-; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-NEXT:    s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i8_trailing_unused:
+; GFX942-GISEL:       ; %bb.1:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_branch .LBB33_0
+; GFX942-GISEL-NEXT:    .p2align 8
+; GFX942-GISEL-NEXT:  ; %bb.2:
+; GFX942-GISEL-NEXT:  .LBB33_0:
+; GFX942-GISEL-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i8_trailing_unused:
+; GFX90a-SDAG:       ; %bb.1:
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT:    s_branch .LBB33_0
+; GFX90a-SDAG-NEXT:    .p2align 8
+; GFX90a-SDAG-NEXT:  ; %bb.2:
+; GFX90a-SDAG-NEXT:  .LBB33_0:
+; GFX90a-SDAG-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT:    s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i8_trailing_unused:
+; GFX90a-GISEL:       ; %bb.1:
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT:    s_branch .LBB33_0
+; GFX90a-GISEL-NEXT:    .p2align 8
+; GFX90a-GISEL-NEXT:  ; %bb.2:
+; GFX90a-GISEL-NEXT:  .LBB33_0:
+; GFX90a-GISEL-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out
   ret void



More information about the llvm-commits mailing list