[llvm] [AMDGPU][GlobalISel] Enable kernel argument preloading (PR #134655)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 16 00:27:28 PDT 2025
https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/134655
>From 729ed42e2a882ae3e86af52c67a7dfbd9126dca4 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Apr 2025 11:39:52 +0000
Subject: [PATCH 1/4] [AMDGPU][GlobalISel] Enable kernel argument preloading
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 97 +++++++++--
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 4 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 9 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 157 +++++++++++-------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 -
5 files changed, 187 insertions(+), 81 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a15f193549936..c9cf0c8fbcb7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,8 +20,10 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/MC/MCRegister.h"
#define DEBUG_TYPE "amdgpu-call-lowering"
@@ -497,6 +499,66 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
+/// Lower a kernel argument whose value was preloaded into SGPRs.
+///
+/// Each register in \p PreloadRegs is read through its live-in virtual
+/// register; the first time such a register is seen, a COPY from the physical
+/// register is emitted and the physical register is added as a block live-in.
+/// The pieces are merged into one scalar, truncated to the store size of
+/// \p ArgTy when the registers are wider than the argument, bitcast when the
+/// argument is a vector, and the result replaces \p VRegs[0].
+///
+/// An argument smaller than one SGPR whose \p Alignment is also below four
+/// bytes is taken from within a register at the byte position given by
+/// \p ArgOffset and written directly to \p VRegs[0].
+void AMDGPUCallLowering::lowerPreloadedParameter(
+    MachineIRBuilder &B, ArrayRef<Register> VRegs, Type *ArgTy,
+    uint64_t ArgOffset, Align Alignment,
+    ArrayRef<MCRegister> PreloadRegs) const {
+  MachineFunction &MF = B.getMF();
+  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const DataLayout &DL = B.getDataLayout();
+
+  LLT ResTy = getLLTForType(*ArgTy, DL);
+  LLT ScalarTy = LLT::scalar(DL.getTypeSizeInBits(ArgTy));
+  unsigned TotalSize = 0;
+  SmallVector<Register> SrcRegs(PreloadRegs.size());
+
+  for (auto [Idx, PhysReg] : enumerate(PreloadRegs)) {
+    Register VReg = MRI.getLiveInVirtReg(PhysReg);
+    TypeSize RegSize = TRI->getRegSizeInBits(VReg, MRI);
+
+    // Materialize the live-in copy only once per register.
+    if (!MRI.getVRegDef(VReg)) {
+      MRI.setType(VReg, LLT::scalar(RegSize));
+      B.getMBB().addLiveIn(PhysReg);
+      B.buildInstr(TargetOpcode::COPY).addDef(VReg).addReg(PhysReg);
+    }
+
+    constexpr const unsigned SGPRSize = 4;
+    // Arg is preloaded into the previous SGPR.
+    if (DL.getTypeStoreSize(ArgTy) < SGPRSize && Alignment < SGPRSize) {
+      int64_t AlignDownOffset = alignDown(ArgOffset, SGPRSize);
+      int64_t OffsetDiff = ArgOffset - AlignDownOffset;
+      auto ShiftAmt = B.buildConstant(LLT::scalar(32), OffsetDiff * 8);
+      // The value starts OffsetDiff bytes above bit 0 of the register, so a
+      // logical right shift brings it down to the low bits that the truncate
+      // keeps. A left rotate would select the wrong byte for odd offsets.
+      auto Shift = B.buildLShr(LLT::scalar(RegSize), VReg, ShiftAmt);
+
+      if (ResTy.isVector())
+        B.buildBitcast(VRegs[0], B.buildTrunc(ScalarTy, Shift));
+      else
+        B.buildTrunc(VRegs[0], Shift);
+
+      return;
+    }
+
+    TotalSize += RegSize;
+    SrcRegs[Idx] = VReg;
+  }
+
+  LLT MergeTy = LLT::scalar(TotalSize);
+  Register Res = SrcRegs.back();
+
+  // Several registers hold one argument: glue them into a single wide scalar.
+  if (SrcRegs.size() > 1)
+    Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);
+
+  // Drop the bits that exist only because the registers are wider than the
+  // argument.
+  if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits())
+    Res = B.buildTrunc(ScalarTy, Res).getReg(0);
+
+  if (ResTy.isVector())
+    Res = B.buildBitcast(ResTy, Res).getReg(0);
+
+  MRI.replaceRegWith(Res, VRegs[0]);
+}
+
+
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -513,6 +575,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
+ if (Subtarget->hasKernargPreload())
+ TLI.allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
+
unsigned i = 0;
const Align KernArgBaseAlign(16);
const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
@@ -520,12 +585,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
// TODO: Align down to dword alignment and extract bits for extending loads.
for (auto &Arg : F.args()) {
- // TODO: Add support for kernarg preload.
- if (Arg.hasAttribute("amdgpu-hidden-argument")) {
- LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
- return false;
- }
-
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
@@ -559,13 +618,29 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
}
- } else {
- ArgInfo OrigArg(VRegs[i], Arg, i);
- const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
- setArgFlags(OrigArg, OrigArgIdx, DL, F);
- lowerParameter(B, OrigArg, ArgOffset, Alignment);
+ i++;
+ continue;
}
+ auto &PreloadKernArgs = Info->getArgInfo().PreloadKernArgs;
+ auto PreloadKernArg =
+ Arg.hasInRegAttr() ? PreloadKernArgs.find(i) : PreloadKernArgs.end();
+ if (PreloadKernArg != PreloadKernArgs.end()) {
+ lowerPreloadedParameter(B, VRegs[i], ArgTy, ArgOffset, Alignment,
+ PreloadKernArg->getSecond().Regs);
+ ++i;
+ continue;
+ }
+
+ if (Arg.hasAttribute("amdgpu-hidden-argument"))
+ F.getContext().diagnose(DiagnosticInfoUnsupported(
+ F, "hidden argument in kernel signature was not preloaded",
+ B.getDL()));
+
+ ArgInfo OrigArg(VRegs[i], Arg, i);
+ const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
+ setArgFlags(OrigArg, OrigArgIdx, DL, F);
+ lowerParameter(B, OrigArg, ArgOffset, Alignment);
++i;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f2a547b..74fce411e2851 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -30,6 +30,10 @@ class AMDGPUCallLowering final : public CallLowering {
void lowerParameter(MachineIRBuilder &B, ArgInfo &AI, uint64_t Offset,
Align Alignment) const;
+ void lowerPreloadedParameter(MachineIRBuilder &B, ArrayRef<Register> VRegs,
+ Type *ArgTy, uint64_t ArgOffset, Align Alignment,
+ ArrayRef<MCRegister> PreloadRegs) const;
+
bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv,
SmallVectorImpl<BaseArgInfo> &Outs,
bool IsVarArg) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 533ad349f7500..653f18062405c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1207,8 +1207,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
uint64_t ExplicitArgOffset = 0;
const DataLayout &DL = Fn.getDataLayout();
- unsigned InIndex = 0;
-
for (const Argument &Arg : Fn.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *BaseArgTy = Arg.getType();
@@ -1297,10 +1295,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
unsigned PartOffset = 0;
for (unsigned i = 0; i != NumRegs; ++i) {
- State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
- BasePartOffset + PartOffset,
- MemVT.getSimpleVT(),
- CCValAssign::Full));
+ State.addLoc(CCValAssign::getCustomMem(
+ Arg.getArgNo(), RegisterVT, BasePartOffset + PartOffset,
+ MemVT.getSimpleVT(), CCValAssign::Full));
PartOffset += MemVT.getStoreSize();
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bd95bcd89e183..fa11812d25677 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2536,84 +2536,115 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
+/// Try to assign user SGPRs to one preloaded kernel argument (or argument
+/// part) of \p ArgSize bytes located at \p ArgOffset, recorded under \p Idx.
+///
+/// \p LastExplicitArgOffset is the offset just past the previously preloaded
+/// data; it is advanced past the SGPRs allocated here. The gap between it and
+/// \p ArgOffset is accounted for as padding SGPRs.
+///
+/// \returns false when there are not enough free user SGPRs for the padding
+/// plus the argument, true otherwise.
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+                                uint64_t ArgOffset, unsigned ArgSize,
+                                unsigned Idx, MachineFunction &MF,
+                                const SIRegisterInfo &TRI,
+                                SIMachineFunctionInfo &Info, CCState &CCInfo) {
+  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
+  const Align KernelArgBaseAlign = Align(16);
+  Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+  constexpr const unsigned SGPRSize = 4;
+  unsigned NumAllocSGPRs = alignTo(ArgSize, SGPRSize) / SGPRSize;
+
+  // Arg is preloaded into the previous SGPR.
+  if (ArgSize < SGPRSize && Alignment < SGPRSize) {
+    assert(Idx >= 1 && "No previous SGPR");
+    AMDGPUFunctionArgInfo &ArgInfo = Info.getArgInfo();
+    // Fetch the previous register by value before creating the entry for
+    // Idx: operator[] may insert, and a reference held across an insertion
+    // can dangle in hash-map style containers.
+    // NOTE(review): confirm the container type of PreloadKernArgs.
+    auto &PrevRegs = ArgInfo.PreloadKernArgs[Idx - 1].Regs;
+    assert(!PrevRegs.empty() && "Previous argument has no preload register");
+    MCRegister PrevReg = PrevRegs[0];
+    ArgInfo.PreloadKernArgs[Idx].Regs.push_back(PrevReg);
+    return true;
+  }
+
+  // Bytes skipped since the last preloaded data still occupy SGPRs.
+  unsigned Padding = ArgOffset - LastExplicitArgOffset;
+  unsigned PaddingSGPRs = alignTo(Padding, SGPRSize) / SGPRSize;
+  // Check for free user SGPRs for preloading.
+  if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs())
+    return false;
+
+  // Preload this argument.
+  const TargetRegisterClass *RC =
+      TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+  SmallVectorImpl<MCRegister> *PreloadRegs =
+      Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
+  // When several registers came back, each one is a 32-bit SGPR live-in.
+  if (PreloadRegs->size() > 1)
+    RC = &AMDGPU::SGPR_32RegClass;
+
+  for (MCRegister Reg : *PreloadRegs) {
+    assert(Reg);
+    MF.addLiveIn(Reg, RC);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  LastExplicitArgOffset = NumAllocSGPRs * SGPRSize + ArgOffset;
+  return true;
+}
+
+
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
- CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+ CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
Function &F = MF.getFunction();
- unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
- GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
- bool InPreloadSequence = true;
- unsigned InIdx = 0;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
+ uint64_t ExplicitArgOffset = BaseOffset;
+ uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+ unsigned LocIdx = 0;
bool AlignedForImplictArgs = false;
unsigned ImplicitArgOffset = 0;
+
for (auto &Arg : F.args()) {
- if (!InPreloadSequence || !Arg.hasInRegAttr())
+ if (!Arg.hasInRegAttr())
break;
- unsigned ArgIdx = Arg.getArgNo();
- // Don't preload non-original args or parts not in the current preload
- // sequence.
- if (InIdx < Ins.size() &&
- (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+
+ if (AllocSize == 0)
break;
- for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
- Ins[InIdx].getOrigArgIndex() == ArgIdx;
- InIdx++) {
- assert(ArgLocs[ArgIdx].isMemLoc());
- auto &ArgLoc = ArgLocs[InIdx];
- const Align KernelArgBaseAlign = Align(16);
- unsigned ArgOffset = ArgLoc.getLocMemOffset();
- Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
- unsigned NumAllocSGPRs =
- alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
-
- // Fix alignment for hidden arguments.
- if (Arg.hasAttribute("amdgpu-hidden-argument")) {
- if (!AlignedForImplictArgs) {
- ImplicitArgOffset =
- alignTo(LastExplicitArgOffset,
- Subtarget->getAlignmentForImplicitArgPtr()) -
- LastExplicitArgOffset;
- AlignedForImplictArgs = true;
- }
- ArgOffset += ImplicitArgOffset;
- }
+ MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+ Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
- // Arg is preloaded into the previous SGPR.
- if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
- assert(InIdx >= 1 && "No previous SGPR");
- Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
- Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
- continue;
- }
+ // Fix alignment for hidden arguments.
+ if (Arg.hasAttribute("amdgpu-hidden-argument") && !AlignedForImplictArgs) {
+ ImplicitArgOffset = alignTo(LastExplicitArgOffset,
+ Subtarget->getAlignmentForImplicitArgPtr()) -
+ LastExplicitArgOffset;
+ AlignedForImplictArgs = true;
+ }
- unsigned Padding = ArgOffset - LastExplicitArgOffset;
- unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
- // Check for free user SGPRs for preloading.
- if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
- InPreloadSequence = false;
- break;
- }
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
- // Preload this argument.
- const TargetRegisterClass *RC =
- TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
- SmallVectorImpl<MCRegister> *PreloadRegs =
- Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
-
- if (PreloadRegs->size() > 1)
- RC = &AMDGPU::SGPR_32RegClass;
- for (auto &Reg : *PreloadRegs) {
- assert(Reg);
- MF.addLiveIn(Reg, RC);
- CCInfo.AllocateReg(Reg);
- }
+ if (ArgLocs.empty()) {
+ // global isel
+ if (Arg.hasAttribute("amdgpu-hidden-argument"))
+ ArgOffset += ImplicitArgOffset;
- LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+ if (!allocPreloadKernArg(LastExplicitArgOffset, ArgOffset, AllocSize,
+ Arg.getArgNo(), MF, TRI, Info, CCInfo))
+ return; // no more available sgprs
+ } else {
+ // DAG isel
+ for (; LocIdx < ArgLocs.size() &&
+ ArgLocs[LocIdx].getValNo() == Arg.getArgNo();
+ LocIdx++) {
+ CCValAssign &ArgLoc = ArgLocs[LocIdx];
+ assert(ArgLoc.isMemLoc());
+ uint64_t LocOffset = ArgLoc.getLocMemOffset();
+ unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
+ if (Arg.hasAttribute("amdgpu-hidden-argument"))
+ LocOffset += ImplicitArgOffset;
+
+ if (!allocPreloadKernArg(LastExplicitArgOffset, LocOffset, LocSize,
+ LocIdx, MF, TRI, Info, CCInfo))
+ return; // no more available sgprs
+ }
}
}
}
@@ -2935,7 +2966,7 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
if (IsKernel && Subtarget->hasKernargPreload())
- allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+ allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
} else if (!IsGraphics) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dc0634331caf9..1efab6598dc5b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -576,7 +576,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void allocatePreloadKernArgSGPRs(CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
>From 78799e090f78ca1eed0413ee61d35cbbb894327b Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 10 Apr 2025 16:13:29 +0000
Subject: [PATCH 2/4] update tests
---
.../amdhsa-kernarg-preload-num-sgprs.ll | 6 +-
...alid-hidden-kernarg-in-kernel-signature.ll | 10 +-
.../AMDGPU/preload-implicit-kernargs.ll | 1963 ++++++----
.../CodeGen/AMDGPU/preload-kernarg-header.ll | 43 +-
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 3289 +++++++++++------
5 files changed, 3608 insertions(+), 1703 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..17d23c4746f69 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM %s
; OBJDUMP: Contents of section .rodata:
; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll b/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll
index 8344cf9265397..ed5791ec20da7 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll
@@ -1,17 +1,15 @@
-; RUN: not llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
-; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -global-isel=1 -global-isel-abort=2 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s
-; RUN: not llc -global-isel=0 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
+; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
+; RUN: not llc -global-isel=1 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
+; RUN: not llc -global-isel=0 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR %s
define amdgpu_kernel void @no_free_sgprs_block_count_x_no_preload_diag(ptr addrspace(1) inreg %out, i512 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
-; GISEL: warning: Instruction selection used fallback path for no_free_sgprs_block_count_x_no_preload_diag
; ERROR: error: <unknown>:0:0: in function no_free_sgprs_block_count_x_no_preload_diag void (ptr addrspace(1), i512, i32): hidden argument in kernel signature was not preloaded
store i32 %_hidden_block_count_x, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @preloadremainder_z_no_preload_diag(ptr addrspace(1) inreg %out, i256 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
-; GISEL: warning: Instruction selection used fallback path for preloadremainder_z_no_preload_diag
; ERROR: error: <unknown>:0:0: in function preloadremainder_z_no_preload_diag void (ptr addrspace(1), i256, i32, i32, i32, i16, i16, i16, i16, i16, i16): hidden argument in kernel signature was not preloaded
%conv = zext i16 %_hidden_remainder_z to i32
store i32 %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index c26f0926d86b2..537021f20c73d 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -1,35 +1,65 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a,GFX90a-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a,GFX90a-GISEL %s
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_x:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB0_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB0_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_x:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB0_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB0_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB0_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB0_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_x:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB0_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB0_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_x:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB0_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB0_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_x:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB0_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB0_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -37,34 +67,65 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
}
define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 {
-; GFX942-LABEL: preload_unused_arg_block_count_x:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB1_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB1_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s6
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_unused_arg_block_count_x:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB1_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB1_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_unused_arg_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB1_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB1_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s12
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_unused_arg_block_count_x:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB1_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB1_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_unused_arg_block_count_x:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB1_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB1_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_unused_arg_block_count_x:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB1_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB1_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -72,35 +133,65 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
}
define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) {
-; GFX942-LABEL: no_free_sgprs_block_count_x:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB2_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB2_0:
-; GFX942-NEXT: s_load_dword s0, s[4:5], 0x28
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: no_free_sgprs_block_count_x:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB2_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB2_0:
+; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x28
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: no_free_sgprs_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB2_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB2_0:
-; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x28
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: no_free_sgprs_block_count_x:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB2_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB2_0:
+; GFX90a-SDAG-NEXT: s_load_dword s0, s[8:9], 0x28
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: no_free_sgprs_block_count_x:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB2_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB2_0:
+; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x28
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: no_free_sgprs_block_count_x:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB2_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB2_0:
+; GFX90a-GISEL-NEXT: s_load_dword s0, s[8:9], 0x28
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[12:13]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -108,25 +199,45 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
}
define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
-; GFX942-LABEL: no_inreg_block_count_x:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: no_inreg_block_count_x:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: no_inreg_block_count_x:
-; GFX90a: ; %bb.0:
-; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: no_inreg_block_count_x:
+; GFX90a-SDAG: ; %bb.0:
+; GFX90a-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: no_inreg_block_count_x:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: no_inreg_block_count_x:
+; GFX90a-GISEL: ; %bb.0:
+; GFX90a-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -137,25 +248,45 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
; args are inreg (preloaded).
define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 {
-; GFX942-LABEL: mixed_inreg_block_count_x:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: mixed_inreg_block_count_x:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x10
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: mixed_inreg_block_count_x:
-; GFX90a: ; %bb.0:
-; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: mixed_inreg_block_count_x:
+; GFX90a-SDAG: ; %bb.0:
+; GFX90a-SDAG-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: mixed_inreg_block_count_x:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x10
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: mixed_inreg_block_count_x:
+; GFX90a-GISEL: ; %bb.0:
+; GFX90a-GISEL-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -180,7 +311,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i64_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
@@ -190,7 +321,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
@@ -199,35 +330,65 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
}
define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: incorrect_type_i16_block_count_x:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB6_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB6_0:
-; GFX942-NEXT: s_load_dword s0, s[0:1], 0x8
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_short v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: incorrect_type_i16_block_count_x:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB6_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB6_0:
+; GFX942-SDAG-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: incorrect_type_i16_block_count_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB6_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB6_0:
-; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: incorrect_type_i16_block_count_x:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB6_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB6_0:
+; GFX90a-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: incorrect_type_i16_block_count_x:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB6_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB6_0:
+; GFX942-GISEL-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: incorrect_type_i16_block_count_x:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB6_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB6_0:
+; GFX90a-GISEL-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
store i16 %load, ptr addrspace(1) %out
@@ -235,32 +396,61 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
}
define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_y:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB7_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB7_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_y:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB7_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB7_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_block_count_y:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB7_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB7_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_y:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB7_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB7_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_y:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB7_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB7_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_y:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB7_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB7_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
%load = load i32, ptr addrspace(4) %gep
@@ -269,37 +459,67 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
}
define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: random_incorrect_offset:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB8_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB8_0:
-; GFX942-NEXT: s_mov_b32 s4, 8
-; GFX942-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: random_incorrect_offset:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB8_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB8_0:
+; GFX942-SDAG-NEXT: s_mov_b32 s4, 8
+; GFX942-SDAG-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: random_incorrect_offset:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB8_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB8_0:
-; GFX90a-NEXT: s_mov_b32 s0, 8
-; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: random_incorrect_offset:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB8_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB8_0:
+; GFX90a-SDAG-NEXT: s_mov_b32 s0, 8
+; GFX90a-SDAG-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: random_incorrect_offset:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB8_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB8_0:
+; GFX942-GISEL-NEXT: s_load_dword s0, s[0:1], 0xa
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: random_incorrect_offset:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB8_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB8_0:
+; GFX90a-GISEL-NEXT: s_load_dword s0, s[4:5], 0xa
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
%load = load i32, ptr addrspace(4) %gep
@@ -308,34 +528,65 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
}
define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_z:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB9_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB9_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s6
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_z:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB9_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB9_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_block_count_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB9_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB9_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s12
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_z:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB9_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB9_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_z:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB9_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB9_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_z:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB9_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB9_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%load = load i32, ptr addrspace(4) %gep
@@ -344,38 +595,73 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
}
define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 {
-; GFX942-LABEL: preload_block_count_x_imparg_align_ptr_i8:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB10_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB10_0:
-; GFX942-NEXT: s_and_b32 s0, s4, 0xff
-; GFX942-NEXT: s_add_i32 s0, s6, s0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB10_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB10_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT: s_add_i32 s0, s6, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB10_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB10_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
-; GFX90a-NEXT: s_add_i32 s0, s12, s0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB10_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB10_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT: s_add_i32 s0, s10, s0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB10_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB10_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT: s_add_i32 s0, s6, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB10_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB10_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT: s_add_i32 s0, s10, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
%ext = zext i8 %val to i32
@@ -385,38 +671,73 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
}
define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_xyz:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB11_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB11_0:
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_xyz:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB11_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB11_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_block_count_xyz:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB11_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB11_0:
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_xyz:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB11_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB11_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_xyz:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB11_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB11_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_xyz:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB11_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB11_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
%load_x = load i32, ptr addrspace(4) %gep_x
@@ -432,35 +753,65 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
}
define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_x:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB12_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB12_0:
-; GFX942-NEXT: s_and_b32 s0, s7, 0xffff
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_x:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB12_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB12_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s7, 0xffff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_workgroup_size_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB12_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB12_0:
-; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_x:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB12_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB12_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_x:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB12_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB12_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s7, 0xffff
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_x:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB12_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB12_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
%load = load i16, ptr addrspace(4) %gep
@@ -470,35 +821,65 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
}
define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_y:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB13_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB13_0:
-; GFX942-NEXT: s_lshr_b32 s0, s7, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_y:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB13_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB13_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_workgroup_size_y:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB13_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_y:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB13_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB13_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_y:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB13_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB13_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s7, s7, 16
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_y:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB13_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB13_0:
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s11, s11, 16
+; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
%load = load i16, ptr addrspace(4) %gep
@@ -508,37 +889,69 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
}
define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_z:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s8, s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB14_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB14_0:
-; GFX942-NEXT: s_and_b32 s0, s8, 0xffff
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_z:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s8, s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB14_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB14_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_workgroup_size_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB14_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_and_b32 s0, s14, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_z:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB14_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB14_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_z:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s8, s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB14_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB14_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_z:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB14_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB14_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
%load = load i16, ptr addrspace(4) %gep
@@ -548,45 +961,85 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
}
define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_workgroup_size_xyz:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s8, s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB15_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB15_0:
-; GFX942-NEXT: s_lshr_b32 s0, s7, 16
-; GFX942-NEXT: s_and_b32 s1, s7, 0xffff
-; GFX942-NEXT: s_and_b32 s4, s8, 0xffff
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_workgroup_size_xyz:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s8, s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB15_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB15_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX942-SDAG-NEXT: s_and_b32 s1, s7, 0xffff
+; GFX942-SDAG-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_workgroup_size_xyz:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB15_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_and_b32 s1, s13, 0xffff
-; GFX90a-NEXT: s_and_b32 s2, s14, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s1
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_workgroup_size_xyz:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB15_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB15_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s11, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s2, s12, 0xffff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_workgroup_size_xyz:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s8, s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB15_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB15_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s7, s7, 16
+; GFX942-GISEL-NEXT: s_and_b32 s0, s7, 0xffff
+; GFX942-GISEL-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX942-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_workgroup_size_xyz:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB15_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB15_0:
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s11, s11, 16
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-GISEL-NEXT: s_and_b32 s1, s12, 0xffff
+; GFX90a-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
%load_x = load i16, ptr addrspace(4) %gep_x
@@ -605,37 +1058,69 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
}
define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_remainder_x:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s8, s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB16_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB16_0:
-; GFX942-NEXT: s_lshr_b32 s0, s8, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_remainder_x:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s8, s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB16_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB16_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s8, 16
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_remainder_x:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB16_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s14, 16
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_remainder_x:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB16_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB16_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_remainder_x:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s8, s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB16_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB16_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s8, s8, 16
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_remainder_x:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB16_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB16_0:
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s12, s12, 16
+; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
%load = load i16, ptr addrspace(4) %gep
@@ -645,35 +1130,69 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
}
define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preloadremainder_y:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB17_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB17_0:
-; GFX942-NEXT: s_and_b32 s0, s9, 0xffff
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preloadremainder_y:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB17_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB17_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preloadremainder_y:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB17_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: s_and_b32 s0, s15, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preloadremainder_y:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB17_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB17_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preloadremainder_y:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB17_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB17_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preloadremainder_y:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB17_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB17_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
%load = load i16, ptr addrspace(4) %gep
@@ -683,35 +1202,69 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
}
define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preloadremainder_z:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB18_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB18_0:
-; GFX942-NEXT: s_lshr_b32 s0, s9, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preloadremainder_z:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB18_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB18_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s9, 16
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preloadremainder_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB18_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB18_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preloadremainder_z:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB18_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB18_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preloadremainder_z:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB18_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB18_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s9, s9, 16
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preloadremainder_z:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB18_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB18_0:
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s13, s13, 16
+; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
%load = load i16, ptr addrspace(4) %gep
@@ -721,43 +1274,85 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
}
define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preloadremainder_xyz:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB19_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB19_0:
-; GFX942-NEXT: s_lshr_b32 s0, s9, 16
-; GFX942-NEXT: s_lshr_b32 s1, s8, 16
-; GFX942-NEXT: s_and_b32 s4, s9, 0xffff
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preloadremainder_xyz:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB19_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB19_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s9, 16
+; GFX942-SDAG-NEXT: s_lshr_b32 s1, s8, 16
+; GFX942-SDAG-NEXT: s_and_b32 s4, s9, 0xffff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preloadremainder_xyz:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB19_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB19_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
-; GFX90a-NEXT: s_lshr_b32 s1, s14, 16
-; GFX90a-NEXT: s_and_b32 s2, s15, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s1
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preloadremainder_xyz:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB19_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB19_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s12, 16
+; GFX90a-SDAG-NEXT: s_and_b32 s2, s13, 0xffff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preloadremainder_xyz:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB19_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB19_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s8, s8, 16
+; GFX942-GISEL-NEXT: v_alignbit_b32 v1, s9, s9, 16
+; GFX942-GISEL-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preloadremainder_xyz:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB19_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB19_0:
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s12, s12, 16
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s13, s13, 16
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
%load_x = load i16, ptr addrspace(4) %gep_x
@@ -776,35 +1371,65 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
}
define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) {
-; GFX942-LABEL: no_free_sgprs_preloadremainder_z:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB20_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB20_0:
-; GFX942-NEXT: s_lshr_b32 s0, s15, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB20_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB20_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s15, 16
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: no_free_sgprs_preloadremainder_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB20_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB20_0:
-; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x1c
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB20_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB20_0:
+; GFX90a-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1c
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB20_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB20_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s15, s15, 16
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB20_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB20_0:
+; GFX90a-GISEL-NEXT: s_load_dword s0, s[8:9], 0x1c
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s0, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[12:13]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
%load = load i16, ptr addrspace(4) %gep
@@ -816,35 +1441,71 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs.
define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 {
-; GFX942-LABEL: preload_block_max_user_sgprs:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s12, s[0:1], 0x28
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB21_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB21_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s12
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_block_max_user_sgprs:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s12, s[0:1], 0x28
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB21_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB21_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_block_max_user_sgprs:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB21_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB21_0:
-; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_max_user_sgprs:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dword s14, s[4:5], 0x20
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB21_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB21_0:
+; GFX90a-SDAG-NEXT: s_load_dword s0, s[4:5], 0x28
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_max_user_sgprs:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s12, s[0:1], 0x28
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB21_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB21_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_max_user_sgprs:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dword s14, s[4:5], 0x20
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB21_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB21_0:
+; GFX90a-GISEL-NEXT: s_load_dword s0, s[4:5], 0x28
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -852,45 +1513,81 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
}
define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 {
-; GFX942-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB22_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB22_0:
-; GFX942-NEXT: s_lshr_b32 s0, s9, 16
-; GFX942-NEXT: s_and_b32 s1, s8, 0xffff
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s1
-; GFX942-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB22_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB22_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s9, 16
+; GFX942-SDAG-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB22_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c
-; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
-; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX90a-SDAG-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB22_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB22_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s12, 0xffff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB22_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB22_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s9, s9, 16
+; GFX942-GISEL-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB22_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB22_0:
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s13, s13, 16
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 58f0b9657476c..1251d358e501f 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,7 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM,ASM-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM,ASM-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
; OBJ: preload_ptr_kernarg_header
; OBJ-COUNT-60: s_nop 0
@@ -22,17 +25,29 @@ define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) {
; OBJ: preload_i32_kernarg_header
; OBJ-COUNT-58: s_nop 0
define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg %arg1) {
-; ASM-LABEL: preload_i32_kernarg_header:
-; ASM: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; ASM-NEXT: s_load_dword s10, s[4:5], 0x8
-; ASM-NEXT: s_waitcnt lgkmcnt(0)
-; ASM-NEXT: s_branch .LBB1_0
-; ASM-NEXT: .p2align 8
-; ASM-NEXT: .LBB1_0:
-; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; ASM-NEXT: v_mov_b32_e32 v2, s10
-; ASM-NEXT: flat_store_dword v[0:1], v2
-; ASM-NEXT: s_endpgm
+; ASM-SDAG-LABEL: preload_i32_kernarg_header:
+; ASM-SDAG: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; ASM-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
+; ASM-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; ASM-SDAG-NEXT: s_branch .LBB1_0
+; ASM-SDAG-NEXT: .p2align 8
+; ASM-SDAG-NEXT: .LBB1_0:
+; ASM-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; ASM-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; ASM-SDAG-NEXT: flat_store_dword v[0:1], v2
+; ASM-SDAG-NEXT: s_endpgm
+;
+; ASM-GISEL-LABEL: preload_i32_kernarg_header:
+; ASM-GISEL: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; ASM-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
+; ASM-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; ASM-GISEL-NEXT: s_branch .LBB1_0
+; ASM-GISEL-NEXT: .p2align 8
+; ASM-GISEL-NEXT: .LBB1_0:
+; ASM-GISEL-NEXT: v_mov_b32_e32 v2, s10
+; ASM-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; ASM-GISEL-NEXT: flat_store_dword v[0:1], v2
+; ASM-GISEL-NEXT: s_endpgm
store i32 %arg1, ptr %arg
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 7ae0c11dca279..6cf7ebe6ded12 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,216 +1,400 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942 %s
-
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a,GFX90a-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a,GFX90a-GISEL %s
define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i8:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB0_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB0_0:
-; GFX942-NEXT: s_and_b32 s0, s4, 0xff
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i8:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB0_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB0_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: ptr1_i8:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB0_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB0_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i8:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB0_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB0_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i8:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB0_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB0_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i8:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB0_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB0_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zeroext inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i8_zext_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB1_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB1_0:
-; GFX942-NEXT: s_and_b32 s0, s4, 0xff
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i8_zext_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB1_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB1_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: ptr1_i8_zext_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB1_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB1_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i8_zext_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB1_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB1_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i8_zext_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB1_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB1_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i8_zext_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB1_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB1_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i16_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB2_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB2_0:
-; GFX942-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i16_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB2_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB2_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: ptr1_i16_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB2_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB2_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i16_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB2_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB2_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i16_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB2_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB2_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i16_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB2_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB2_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 inreg %arg0) #0 {
-; GFX942-LABEL: ptr1_i32_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB3_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB3_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i32_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB3_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB3_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: ptr1_i32_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB3_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB3_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i32_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB3_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB3_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i32_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB3_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB3_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i32_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB3_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB3_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspace(1) inreg %out, i32 inreg %arg1) #0 {
-; GFX942-LABEL: i32_ptr1_i32_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_load_dword s6, s[0:1], 0x10
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB4_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB4_0:
-; GFX942-NEXT: s_add_i32 s0, s2, s6
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[4:5]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i32_ptr1_i32_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB4_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB4_0:
+; GFX942-SDAG-NEXT: s_add_i32 s0, s2, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB4_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB4_0:
-; GFX90a-NEXT: s_add_i32 s0, s8, s12
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i32_ptr1_i32_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dword s6, s[0:1], 0x10
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB4_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB4_0:
+; GFX942-GISEL-NEXT: s_add_i32 s0, s2, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB4_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB4_0:
+; GFX90a-SDAG-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB4_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB4_0:
+; GFX90a-GISEL-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX90a-GISEL-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #0 {
-; GFX942-LABEL: ptr1_i16_i16_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB5_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB5_0:
-; GFX942-NEXT: s_lshr_b32 s0, s4, 16
-; GFX942-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX942-NEXT: s_add_i32 s0, s1, s0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i16_i16_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB5_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB5_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s4, 16
+; GFX942-SDAG-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX942-SDAG-NEXT: s_add_i32 s0, s1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB5_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB5_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s10, 16
-; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
-; GFX90a-NEXT: s_add_i32 s0, s1, s0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i16_i16_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB5_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB5_0:
+; GFX942-GISEL-NEXT: v_alignbit_b32 v0, s4, s4, 16
+; GFX942-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX942-GISEL-NEXT: v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB5_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB5_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_add_i32 s0, s1, s0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB5_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB5_0:
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s8, s8, 16
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-GISEL-NEXT: v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
%add = add i32 %ext, %ext1
@@ -219,33 +403,61 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
}
define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 x i8> inreg %in) #0 {
-; GFX942-LABEL: ptr1_v2i8_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB6_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB6_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_short v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: ptr1_v2i8_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB6_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB6_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB6_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB6_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: ptr1_v2i8_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB6_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB6_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB6_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB6_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB6_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB6_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
@@ -273,7 +485,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
;
; GFX90a-LABEL: byref_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
@@ -284,9 +496,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -319,7 +531,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: byref_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -330,9 +542,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -343,926 +555,1842 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x i32> inreg %in) #0 {
-; GFX942-LABEL: v8i32_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB9_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB9_0:
-; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: v_mov_b32_e32 v1, s9
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v8i32_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB9_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB9_0:
+; GFX942-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v8i32_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB9_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB9_0:
-; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20
-; GFX90a-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s16
-; GFX90a-NEXT: v_mov_b32_e32 v1, s17
-; GFX90a-NEXT: v_mov_b32_e32 v2, s18
-; GFX90a-NEXT: v_mov_b32_e32 v3, s19
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v8i32_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB9_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB9_0:
+; GFX942-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v8i32_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB9_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB9_0:
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-SDAG-NEXT: s_nop 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v8i32_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB9_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB9_0:
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX90a-GISEL-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i16> inreg %in) #0 {
-; GFX942-LABEL: v3i16_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB10_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB10_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: global_store_short v0, v1, s[2:3] offset:4
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v3i16_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB10_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB10_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v3i16_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB10_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB10_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v3i16_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB10_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB10_0:
+; GFX942-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3i16_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB10_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB10_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3i16_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB10_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB10_0:
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i32> inreg %in) #0 {
-; GFX942-LABEL: v3i32_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB11_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB11_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s7
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v3i32_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB11_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB11_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v3i32_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB11_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB11_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v3i32_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB11_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB11_0:
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT: s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT: s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3i32_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB11_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB11_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3i32_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB11_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB11_0:
+; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x float> inreg %in) #0 {
-; GFX942-LABEL: v3f32_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB12_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB12_0:
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s7
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v3f32_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB12_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB12_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v3f32_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB12_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB12_0:
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v3f32_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB12_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB12_0:
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT: s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT: s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3f32_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB12_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB12_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3f32_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB12_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB12_0:
+; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %out, <5 x i8> inreg %in) #0 {
-; GFX942-LABEL: v5i8_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB13_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB13_0:
-; GFX942-NEXT: s_lshr_b32 s1, s4, 24
-; GFX942-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX942-NEXT: s_lshl_b32 s1, s1, 8
-; GFX942-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX942-NEXT: s_or_b32 s1, s4, s1
-; GFX942-NEXT: s_lshl_b32 s1, s1, 16
-; GFX942-NEXT: s_or_b32 s0, s0, s1
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: global_store_byte v0, v1, s[2:3] offset:4
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v5i8_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB13_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB13_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s1, s4, 24
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX942-SDAG-NEXT: s_lshl_b32 s1, s1, 8
+; GFX942-SDAG-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX942-SDAG-NEXT: s_or_b32 s1, s4, s1
+; GFX942-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; GFX942-SDAG-NEXT: s_or_b32 s0, s0, s1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: global_store_byte v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v5i8_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB13_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
-; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
-; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
-; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
-; GFX90a-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v5i8_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB13_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB13_0:
+; GFX942-GISEL-NEXT: s_and_b32 s1, 0xffff, s4
+; GFX942-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT: s_lshr_b32 s6, s0, 8
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v5i8_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB13_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB13_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 8
+; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-SDAG-NEXT: s_or_b32 s1, s2, s1
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; GFX90a-SDAG-NEXT: s_or_b32 s0, s0, s1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v5i8_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB13_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB13_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s1, 0xffff, s8
+; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s0, 8
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:3
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x double> inreg %in) #0 {
-; GFX942-LABEL: v5f64_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB14_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB14_0:
-; GFX942-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX942-NEXT: v_mov_b32_e32 v1, s9
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v5f64_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB14_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB14_0:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX942-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v5f64_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB14_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40
-; GFX90a-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: v_mov_b32_e32 v0, s16
-; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32
-; GFX90a-NEXT: v_mov_b32_e32 v1, s17
-; GFX90a-NEXT: v_mov_b32_e32 v2, s18
-; GFX90a-NEXT: v_mov_b32_e32 v3, s19
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v5f64_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB14_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB14_0:
+; GFX942-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x60
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: global_store_dwordx2 v8, v[0:1], s[2:3] offset:32
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v5f64_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB14_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB14_0:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-SDAG-NEXT: s_nop 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v5f64_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB14_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB14_0:
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v8, v[0:1], s[6:7] offset:32
+; GFX90a-GISEL-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8> inreg %in) #0 {
-; GFX942-LABEL: v8i8_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB15_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB15_0:
-; GFX942-NEXT: s_lshr_b32 s1, s5, 24
-; GFX942-NEXT: s_and_b32 s0, s5, 0xffff
-; GFX942-NEXT: s_lshl_b32 s1, s1, 8
-; GFX942-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX942-NEXT: s_or_b32 s1, s5, s1
-; GFX942-NEXT: s_lshl_b32 s1, s1, 16
-; GFX942-NEXT: s_lshr_b32 s5, s4, 24
-; GFX942-NEXT: s_or_b32 s0, s0, s1
-; GFX942-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX942-NEXT: s_lshl_b32 s5, s5, 8
-; GFX942-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX942-NEXT: s_or_b32 s4, s4, s5
-; GFX942-NEXT: s_lshl_b32 s4, s4, 16
-; GFX942-NEXT: s_or_b32 s1, s1, s4
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v8i8_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB15_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB15_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s1, s5, 24
+; GFX942-SDAG-NEXT: s_and_b32 s0, s5, 0xffff
+; GFX942-SDAG-NEXT: s_lshl_b32 s1, s1, 8
+; GFX942-SDAG-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX942-SDAG-NEXT: s_or_b32 s1, s5, s1
+; GFX942-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; GFX942-SDAG-NEXT: s_lshr_b32 s5, s4, 24
+; GFX942-SDAG-NEXT: s_or_b32 s0, s0, s1
+; GFX942-SDAG-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX942-SDAG-NEXT: s_lshl_b32 s5, s5, 8
+; GFX942-SDAG-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX942-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX942-SDAG-NEXT: s_lshl_b32 s4, s4, 16
+; GFX942-SDAG-NEXT: s_or_b32 s1, s1, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v8i8_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB15_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s11, 24
-; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s11, 0x80010
-; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_lshr_b32 s2, s10, 24
-; GFX90a-NEXT: s_lshl_b32 s2, s2, 8
-; GFX90a-NEXT: s_bfe_u32 s3, s10, 0x80010
-; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
-; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
-; GFX90a-NEXT: s_or_b32 s2, s3, s2
-; GFX90a-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
-; GFX90a-NEXT: s_lshl_b32 s2, s2, 16
-; GFX90a-NEXT: s_or_b32 s1, s1, s2
-; GFX90a-NEXT: v_mov_b32_e32 v0, s1
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v8i8_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB15_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB15_0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v8i8_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB15_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB15_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s9, 24
+; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 8
+; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s9, 0x80010
+; GFX90a-SDAG-NEXT: s_or_b32 s1, s2, s1
+; GFX90a-SDAG-NEXT: s_lshr_b32 s2, s8, 24
+; GFX90a-SDAG-NEXT: s_lshl_b32 s2, s2, 8
+; GFX90a-SDAG-NEXT: s_bfe_u32 s3, s8, 0x80010
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; GFX90a-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX90a-SDAG-NEXT: s_or_b32 s0, s0, s1
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_lshl_b32 s2, s2, 16
+; GFX90a-SDAG-NEXT: s_or_b32 s1, s1, s2
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v8i8_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB15_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB15_0:
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i64 inreg %a) #0 {
-; GFX942-LABEL: i64_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB16_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB16_0:
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i64_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB16_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB16_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i64_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB16_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i64_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB16_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB16_0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i64_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB16_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB16_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i64_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB16_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB16_0:
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, double inreg %in) #0 {
-; GFX942-LABEL: f64_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB17_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB17_0:
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: f64_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB17_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB17_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: f64_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB17_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: f64_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB17_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB17_0:
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: f64_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB17_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB17_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: f64_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB17_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB17_0:
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in) #0 {
-; GFX942-LABEL: half_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB18_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB18_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_short v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: half_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB18_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB18_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: half_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB18_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB18_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: half_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB18_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB18_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: half_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB18_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB18_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: half_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB18_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB18_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, bfloat inreg %in) #0 {
-; GFX942-LABEL: bfloat_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB19_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB19_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_short v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: bfloat_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB19_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB19_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: bfloat_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB19_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB19_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: bfloat_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB19_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB19_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB19_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB19_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB19_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB19_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <2 x bfloat> inreg %in) #0 {
-; GFX942-LABEL: v2bfloat_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB20_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB20_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v2bfloat_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB20_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB20_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB20_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB20_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v2bfloat_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB20_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB20_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB20_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB20_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB20_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB20_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <3 x bfloat> inreg %in) #0 {
-; GFX942-LABEL: v3bfloat_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB21_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB21_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: global_store_short v0, v1, s[2:3] offset:4
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v3bfloat_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB21_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB21_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB21_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB21_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v3bfloat_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB21_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB21_0:
+; GFX942-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB21_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB21_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB21_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB21_0:
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <6 x bfloat> inreg %in) #0 {
-; GFX942-LABEL: v6bfloat_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB22_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB22_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s7
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v6bfloat_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB22_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB22_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB22_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v6bfloat_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB22_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB22_0:
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT: s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT: s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB22_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB22_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB22_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB22_0:
+; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in, <7 x bfloat> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: half_v7bfloat_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB23_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB23_0:
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: global_store_short v3, v0, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, s9
-; GFX942-NEXT: global_store_short v3, v0, s[10:11] offset:12
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s7
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB23_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB23_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT: global_store_short v3, v0, s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s9
+; GFX942-SDAG-NEXT: global_store_short v3, v0, s[10:11] offset:12
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB23_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB23_0:
-; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: global_store_short v3, v0, s[8:9]
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s3
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB23_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB23_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_lshr_b32 s0, s6, 16
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[10:11] offset:2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s7
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[10:11] offset:4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT: s_lshr_b32 s5, s8, 16
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[10:11] offset:6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[10:11] offset:8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[10:11] offset:10
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[10:11] offset:12
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB23_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB23_0:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB23_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB23_0:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s10, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT: s_lshr_b32 s3, s11, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:4
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-GISEL-NEXT: s_lshr_b32 s9, s12, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:6
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:12
+; GFX90a-GISEL-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
ret void
}
define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 inreg %in) #0 {
-; GFX942-LABEL: i1_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB24_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB24_0:
-; GFX942-NEXT: s_and_b32 s0, s4, 1
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_byte v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i1_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB24_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB24_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i1_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB24_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB24_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 1
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_byte v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i1_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB24_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB24_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s4, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i1_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB24_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB24_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i1_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB24_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB24_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, fp128 inreg %in) #0 {
-; GFX942-LABEL: fp128_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB25_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB25_0:
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s7
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: v_mov_b32_e32 v3, s9
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: fp128_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB25_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB25_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: fp128_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB25_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB25_0:
-; GFX90a-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: fp128_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB25_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB25_0:
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s6
+; GFX942-GISEL-NEXT: s_mov_b32 s5, s7
+; GFX942-GISEL-NEXT: s_mov_b32 s6, s8
+; GFX942-GISEL-NEXT: s_mov_b32 s7, s9
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: fp128_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB25_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB25_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: fp128_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB25_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB25_0:
+; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT: s_mov_b32 s3, s13
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x i8> inreg %in) #0 {
-; GFX942-LABEL: v7i8_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB26_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB26_0:
-; GFX942-NEXT: s_lshr_b32 s1, s4, 24
-; GFX942-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX942-NEXT: s_lshl_b32 s1, s1, 8
-; GFX942-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX942-NEXT: s_or_b32 s1, s4, s1
-; GFX942-NEXT: s_lshl_b32 s1, s1, 16
-; GFX942-NEXT: s_or_b32 s0, s0, s1
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] offset:6
-; GFX942-NEXT: global_store_short v0, v1, s[2:3] offset:4
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v7i8_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB26_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB26_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s1, s4, 24
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX942-SDAG-NEXT: s_lshl_b32 s1, s1, 8
+; GFX942-SDAG-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX942-SDAG-NEXT: s_or_b32 s1, s4, s1
+; GFX942-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; GFX942-SDAG-NEXT: s_or_b32 s0, s0, s1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] offset:6
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3] offset:4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v7i8_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB26_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB26_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
-; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
-; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
-; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
-; GFX90a-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v7i8_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB26_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB26_0:
+; GFX942-GISEL-NEXT: s_and_b32 s6, 0xffff, s4
+; GFX942-GISEL-NEXT: s_lshr_b32 s6, s6, 8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: s_lshr_b32 s7, s0, 8
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: s_and_b32 s8, 0xffff, s5
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s7
+; GFX942-GISEL-NEXT: s_lshr_b32 s8, s8, 8
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT: s_lshr_b32 s1, s5, 16
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT: global_store_byte v1, v0, s[2:3] offset:6
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB26_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB26_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 8
+; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-SDAG-NEXT: s_or_b32 s1, s2, s1
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; GFX90a-SDAG-NEXT: s_or_b32 s0, s0, s1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB26_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB26_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s2, 0xffff, s8
+; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s2, 8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT: s_lshr_b32 s3, s0, 8
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: s_and_b32 s4, 0xffff, s9
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-GISEL-NEXT: s_lshr_b32 s4, s4, 8
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:3
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s9, 16
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:5
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:6
+; GFX90a-GISEL-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x half> inreg %in) #0 {
-; GFX942-LABEL: v7half_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB27_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB27_0:
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, s9
-; GFX942-NEXT: global_store_short v3, v0, s[2:3] offset:12
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s7
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: v7half_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB27_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB27_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s9
+; GFX942-SDAG-NEXT: global_store_short v3, v0, s[2:3] offset:12
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: v7half_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB27_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB27_0:
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s15
-; GFX90a-NEXT: global_store_short v3, v0, s[8:9] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: v7half_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB27_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB27_0:
+; GFX942-GISEL-NEXT: s_lshr_b32 s0, s6, 16
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s7
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-GISEL-NEXT: s_lshr_b32 s4, s8, 16
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:10
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3] offset:12
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: v7half_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB27_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB27_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: v7half_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB27_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB27_0:
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s11, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s12, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:6
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:12
+; GFX90a-GISEL-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_i32_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB28_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB28_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_short v0, v1, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
-; GFX942-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i16_i32_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB28_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB28_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i16_i32_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB28_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB28_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i16_i32_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB28_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB28_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB28_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB28_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB28_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB28_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[10:11]
+; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
ret void
}
define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <3 x i32> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_v3i32_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB29_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB29_0:
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
-; GFX942-NEXT: v_mov_b32_e32 v0, s6
-; GFX942-NEXT: v_mov_b32_e32 v1, s7
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: global_store_short v3, v4, s[2:3]
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB29_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB29_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-SDAG-NEXT: global_store_short v3, v4, s[2:3]
+; GFX942-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB29_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB29_0:
-; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
-; GFX90a-NEXT: v_mov_b32_e32 v4, s10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_short v3, v4, s[8:9]
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB29_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB29_0:
+; GFX942-GISEL-NEXT: s_mov_b32 s12, s6
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT: s_mov_b32 s13, s7
+; GFX942-GISEL-NEXT: s_mov_b32 s14, s8
+; GFX942-GISEL-NEXT: global_store_short v3, v0, s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s14
+; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB29_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB29_0:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB29_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB29_0:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
ret void
}
define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i16 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_i16_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB30_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB30_0:
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_short v0, v1, s[2:3]
-; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[6:7]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i16_i16_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB30_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB30_0:
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: global_store_short_d16_hi v0, v1, s[6:7]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i16_i16_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB30_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB30_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i16_i16_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB30_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB30_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT: v_alignbit_b32 v1, s4, s4, 16
+; GFX942-GISEL-NEXT: global_store_short v2, v0, s[2:3]
+; GFX942-GISEL-NEXT: global_store_short v2, v1, s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB30_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB30_0:
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB30_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB30_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s8, s8, 16
+; GFX90a-GISEL-NEXT: global_store_short v2, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v2, v1, s[10:11]
+; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
ret void
}
define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <2 x i8> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
-; GFX942-LABEL: i16_v2i8_kernel_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB31_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB31_0:
-; GFX942-NEXT: s_lshr_b32 s0, s4, 24
-; GFX942-NEXT: s_lshl_b32 s0, s0, 8
-; GFX942-NEXT: s_bfe_u32 s1, s4, 0x80010
-; GFX942-NEXT: s_or_b32 s0, s1, s0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: global_store_short v0, v1, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_short v0, v1, s[6:7]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB31_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB31_0:
+; GFX942-SDAG-NEXT: s_lshr_b32 s0, s4, 24
+; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 8
+; GFX942-SDAG-NEXT: s_bfe_u32 s1, s4, 0x80010
+; GFX942-SDAG-NEXT: s_or_b32 s0, s1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB31_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB31_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s10, 24
-; GFX90a-NEXT: s_lshl_b32 s0, s0, 8
-; GFX90a-NEXT: s_bfe_u32 s1, s10, 0x80010
-; GFX90a-NEXT: s_or_b32 s0, s1, s0
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[12:13]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB31_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB31_0:
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT: v_alignbit_b32 v1, s4, s4, 16
+; GFX942-GISEL-NEXT: global_store_short v2, v0, s[2:3]
+; GFX942-GISEL-NEXT: global_store_short v2, v1, s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB31_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB31_0:
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-SDAG-NEXT: s_lshl_b32 s0, s0, 8
+; GFX90a-SDAG-NEXT: s_bfe_u32 s1, s8, 0x80010
+; GFX90a-SDAG-NEXT: s_or_b32 s0, s1, s0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB31_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB31_0:
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s8, s8, 16
+; GFX90a-GISEL-NEXT: global_store_short v2, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v2, v1, s[10:11]
+; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
ret void
@@ -1271,73 +2399,138 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
; The second argument is not expected to be preloaded with the current behavior.
define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, ptr addrspace(1) %out, i32 inreg %arg1) #0 {
-; GFX942-LABEL: i32_ptr1_i32_staggered_preload_arg:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB32_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB32_0:
-; GFX942-NEXT: s_load_dword s3, s[0:1], 0x10
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_add_i32 s0, s2, s3
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[4:5]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB32_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB32_0:
+; GFX942-SDAG-NEXT: s_load_dword s3, s[0:1], 0x10
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_add_i32 s0, s2, s3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB32_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB32_0:
-; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_add_i32 s2, s8, s2
-; GFX90a-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB32_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB32_0:
+; GFX942-GISEL-NEXT: s_load_dword s3, s[0:1], 0x10
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_add_i32 s0, s2, s3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB32_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB32_0:
+; GFX90a-SDAG-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB32_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB32_0:
+; GFX90a-GISEL-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, i8 inreg %arg0, i32 inreg %unused) #0 {
-; GFX942-LABEL: ptr1_i8_trailing_unused:
-; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_branch .LBB33_0
-; GFX942-NEXT: .p2align 8
-; GFX942-NEXT: ; %bb.2:
-; GFX942-NEXT: .LBB33_0:
-; GFX942-NEXT: s_and_b32 s0, s4, 0xff
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: ptr1_i8_trailing_unused:
+; GFX942-SDAG: ; %bb.1:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_branch .LBB33_0
+; GFX942-SDAG-NEXT: .p2align 8
+; GFX942-SDAG-NEXT: ; %bb.2:
+; GFX942-SDAG-NEXT: .LBB33_0:
+; GFX942-SDAG-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
;
-; GFX90a-LABEL: ptr1_i8_trailing_unused:
-; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_branch .LBB33_0
-; GFX90a-NEXT: .p2align 8
-; GFX90a-NEXT: ; %bb.2:
-; GFX90a-NEXT: .LBB33_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
-; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX90a-NEXT: s_endpgm
+; GFX942-GISEL-LABEL: ptr1_i8_trailing_unused:
+; GFX942-GISEL: ; %bb.1:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_branch .LBB33_0
+; GFX942-GISEL-NEXT: .p2align 8
+; GFX942-GISEL-NEXT: ; %bb.2:
+; GFX942-GISEL-NEXT: .LBB33_0:
+; GFX942-GISEL-NEXT: s_and_b32 s0, s4, 0xff
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX90a-SDAG-LABEL: ptr1_i8_trailing_unused:
+; GFX90a-SDAG: ; %bb.1:
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_branch .LBB33_0
+; GFX90a-SDAG-NEXT: .p2align 8
+; GFX90a-SDAG-NEXT: ; %bb.2:
+; GFX90a-SDAG-NEXT: .LBB33_0:
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: s_endpgm
+;
+; GFX90a-GISEL-LABEL: ptr1_i8_trailing_unused:
+; GFX90a-GISEL: ; %bb.1:
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_branch .LBB33_0
+; GFX90a-GISEL-NEXT: .p2align 8
+; GFX90a-GISEL-NEXT: ; %bb.2:
+; GFX90a-GISEL-NEXT: .LBB33_0:
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
>From c2fac73bafd7eba043da5a8d2a7ec8c49210d50b Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 16 Apr 2025 07:09:10 +0000
Subject: [PATCH 3/4] clang format
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index c9cf0c8fbcb7b..a3b5d4b62f31a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -513,17 +513,17 @@ void AMDGPUCallLowering::lowerPreloadedParameter(
LLT ScalarTy = LLT::scalar(DL.getTypeSizeInBits(ArgTy));
unsigned TotalSize = 0;
SmallVector<Register> SrcRegs(PreloadRegs.size());
-
+
for (auto [Idx, PhysReg] : enumerate(PreloadRegs)) {
Register VReg = MRI.getLiveInVirtReg(PhysReg);
TypeSize RegSize = TRI->getRegSizeInBits(VReg, MRI);
-
+
if (!MRI.getVRegDef(VReg)) {
MRI.setType(VReg, LLT::scalar(RegSize));
B.getMBB().addLiveIn(PhysReg);
B.buildInstr(TargetOpcode::COPY).addDef(VReg).addReg(PhysReg);
}
-
+
constexpr const unsigned SGPRSize = 4;
// Arg is preloaded into the previous SGPR.
if (DL.getTypeStoreSize(ArgTy) < SGPRSize && Alignment < SGPRSize) {
>From f4a2d6a1114e38e43807f318f46a770d083212db Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 16 Apr 2025 07:25:02 +0000
Subject: [PATCH 4/4] update tests
---
.../AMDGPU/preload-implicit-kernargs.ll | 359 ++++----
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 844 ++++++++----------
2 files changed, 549 insertions(+), 654 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 537021f20c73d..6dbab775b5187 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -21,16 +21,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
;
; GFX90a-SDAG-LABEL: preload_block_count_x:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB0_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB0_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_block_count_x:
@@ -49,16 +49,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
;
; GFX90a-GISEL-LABEL: preload_block_count_x:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB0_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB0_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -84,17 +84,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
;
; GFX90a-SDAG-LABEL: preload_unused_arg_block_count_x:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB1_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB1_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_unused_arg_block_count_x:
@@ -114,17 +113,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
;
; GFX90a-GISEL-LABEL: preload_unused_arg_block_count_x:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB1_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB1_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -150,7 +148,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
;
; GFX90a-SDAG-LABEL: no_free_sgprs_block_count_x:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB2_0
; GFX90a-SDAG-NEXT: .p2align 8
@@ -160,7 +158,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: no_free_sgprs_block_count_x:
@@ -180,7 +178,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
;
; GFX90a-GISEL-LABEL: no_free_sgprs_block_count_x:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB2_0
; GFX90a-GISEL-NEXT: .p2align 8
@@ -190,7 +188,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[12:13]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[14:15]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -311,7 +309,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i64_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
@@ -321,7 +319,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
@@ -347,7 +345,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
;
; GFX90a-SDAG-LABEL: incorrect_type_i16_block_count_x:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB6_0
; GFX90a-SDAG-NEXT: .p2align 8
@@ -357,7 +355,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: incorrect_type_i16_block_count_x:
@@ -377,7 +375,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
;
; GFX90a-GISEL-LABEL: incorrect_type_i16_block_count_x:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB6_0
; GFX90a-GISEL-NEXT: .p2align 8
@@ -387,7 +385,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
@@ -412,16 +410,15 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
;
; GFX90a-SDAG-LABEL: preload_block_count_y:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB7_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB7_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_block_count_y:
@@ -440,16 +437,15 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
;
; GFX90a-GISEL-LABEL: preload_block_count_y:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB7_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB7_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
@@ -477,7 +473,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
;
; GFX90a-SDAG-LABEL: random_incorrect_offset:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB8_0
; GFX90a-SDAG-NEXT: .p2align 8
@@ -488,7 +484,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: random_incorrect_offset:
@@ -508,7 +504,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
;
; GFX90a-GISEL-LABEL: random_incorrect_offset:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB8_0
; GFX90a-GISEL-NEXT: .p2align 8
@@ -518,7 +514,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
@@ -545,17 +541,16 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
;
; GFX90a-SDAG-LABEL: preload_block_count_z:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB9_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB9_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_block_count_z:
@@ -575,17 +570,16 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
;
; GFX90a-GISEL-LABEL: preload_block_count_z:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB9_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB9_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
@@ -614,19 +608,18 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
;
; GFX90a-SDAG-LABEL: preload_block_count_x_imparg_align_ptr_i8:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB10_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB10_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
-; GFX90a-SDAG-NEXT: s_add_i32 s0, s10, s0
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-SDAG-NEXT: s_add_i32 s0, s12, s0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_block_count_x_imparg_align_ptr_i8:
@@ -648,19 +641,18 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
;
; GFX90a-GISEL-LABEL: preload_block_count_x_imparg_align_ptr_i8:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB10_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB10_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
-; GFX90a-GISEL-NEXT: s_add_i32 s0, s10, s0
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-GISEL-NEXT: s_add_i32 s0, s12, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -690,19 +682,18 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
;
; GFX90a-SDAG-LABEL: preload_block_count_xyz:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB11_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB11_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_block_count_xyz:
@@ -724,19 +715,21 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
;
; GFX90a-GISEL-LABEL: preload_block_count_xyz:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB11_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB11_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
+; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
+; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
@@ -770,17 +763,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
;
; GFX90a-SDAG-LABEL: preload_workgroup_size_x:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB12_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB12_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s13, 0xffff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_workgroup_size_x:
@@ -800,17 +793,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
;
; GFX90a-GISEL-LABEL: preload_workgroup_size_x:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB12_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB12_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s13, 0xffff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -838,17 +831,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
;
; GFX90a-SDAG-LABEL: preload_workgroup_size_y:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB13_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB13_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_workgroup_size_y:
@@ -868,17 +861,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
;
; GFX90a-GISEL-LABEL: preload_workgroup_size_y:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB13_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB13_0:
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s11, s11, 16
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s13, s13, 16
; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
@@ -907,18 +900,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
;
; GFX90a-SDAG-LABEL: preload_workgroup_size_z:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB14_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB14_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s14, 0xffff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_workgroup_size_z:
@@ -939,18 +932,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
;
; GFX90a-GISEL-LABEL: preload_workgroup_size_z:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB14_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB14_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s14, 0xffff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
@@ -983,22 +976,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
;
; GFX90a-SDAG-LABEL: preload_workgroup_size_xyz:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB15_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB15_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s11, 16
-; GFX90a-SDAG-NEXT: s_and_b32 s1, s11, 0xffff
-; GFX90a-SDAG-NEXT: s_and_b32 s2, s12, 0xffff
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s13, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s2, s14, 0xffff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_workgroup_size_xyz:
@@ -1023,22 +1016,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
;
; GFX90a-GISEL-LABEL: preload_workgroup_size_xyz:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB15_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB15_0:
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s11, s11, 16
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s11, 0xffff
-; GFX90a-GISEL-NEXT: s_and_b32 s1, s12, 0xffff
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s13, s13, 16
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-GISEL-NEXT: s_and_b32 s1, s14, 0xffff
; GFX90a-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -1076,18 +1069,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-SDAG-LABEL: preload_remainder_x:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB16_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB16_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s14, 16
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_remainder_x:
@@ -1108,18 +1101,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-GISEL-LABEL: preload_remainder_x:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB16_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB16_0:
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s12, s12, 16
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s14, s14, 16
; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -1148,18 +1141,16 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-SDAG-LABEL: preloadremainder_y:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB17_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB17_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s15, 0xffff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preloadremainder_y:
@@ -1180,18 +1171,16 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-GISEL-LABEL: preloadremainder_y:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB17_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB17_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s15, 0xffff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
@@ -1220,18 +1209,16 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-SDAG-LABEL: preloadremainder_z:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB18_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB18_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s15, 16
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preloadremainder_z:
@@ -1252,18 +1239,16 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-GISEL-LABEL: preloadremainder_z:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB18_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB18_0:
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s13, s13, 16
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s15, s15, 16
; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -1296,22 +1281,20 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
;
; GFX90a-SDAG-LABEL: preloadremainder_xyz:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB19_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB19_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s12, 16
-; GFX90a-SDAG-NEXT: s_and_b32 s2, s13, 0xffff
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s15, 16
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s14, 16
+; GFX90a-SDAG-NEXT: s_and_b32 s2, s15, 0xffff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preloadremainder_xyz:
@@ -1336,22 +1319,20 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
;
; GFX90a-GISEL-LABEL: preloadremainder_xyz:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB19_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB19_0:
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s12, s12, 16
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s13, s13, 16
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s14, s14, 16
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s15, s15, 16
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s15, 0xffff
; GFX90a-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX90a-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -1387,7 +1368,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
;
; GFX90a-SDAG-LABEL: no_free_sgprs_preloadremainder_z:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB20_0
; GFX90a-SDAG-NEXT: .p2align 8
@@ -1398,7 +1379,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: no_free_sgprs_preloadremainder_z:
@@ -1417,7 +1398,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
;
; GFX90a-GISEL-LABEL: no_free_sgprs_preloadremainder_z:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB20_0
; GFX90a-GISEL-NEXT: .p2align 8
@@ -1428,7 +1409,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[12:13]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[14:15]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -1458,10 +1439,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
;
; GFX90a-SDAG-LABEL: preload_block_max_user_sgprs:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-SDAG-NEXT: s_load_dword s14, s[4:5], 0x20
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB21_0
; GFX90a-SDAG-NEXT: .p2align 8
@@ -1471,7 +1449,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_block_max_user_sgprs:
@@ -1491,10 +1469,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
;
; GFX90a-GISEL-LABEL: preload_block_max_user_sgprs:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-GISEL-NEXT: s_load_dword s14, s[4:5], 0x20
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB21_0
; GFX90a-GISEL-NEXT: .p2align 8
@@ -1504,7 +1479,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -1534,21 +1509,23 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
;
; GFX90a-SDAG-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB22_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB22_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-SDAG-NEXT: s_and_b32 s1, s12, 0xffff
+; GFX90a-SDAG-NEXT: s_load_dword s0, s[4:5], 0x1c
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s14, 0xffff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
@@ -1572,21 +1549,23 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
;
; GFX90a-GISEL-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB22_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB22_0:
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s13, s13, 16
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s12, 0xffff
-; GFX90a-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-GISEL-NEXT: s_load_dword s0, s[4:5], 0x1c
+; GFX90a-GISEL-NEXT: s_and_b32 s13, s14, 0xffff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-GISEL-NEXT: s_lshr_b32 s14, s0, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 6cf7ebe6ded12..d64905e34663a 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -37,32 +37,32 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
;
; GFX90a-SDAG-LABEL: ptr1_i8:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB0_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB0_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: ptr1_i8:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB0_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB0_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
@@ -102,32 +102,32 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
;
; GFX90a-SDAG-LABEL: ptr1_i8_zext_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB1_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB1_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: ptr1_i8_zext_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB1_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB1_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -167,32 +167,32 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
;
; GFX90a-SDAG-LABEL: ptr1_i16_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB2_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB2_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: ptr1_i16_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB2_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB2_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -230,30 +230,30 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
;
; GFX90a-SDAG-LABEL: ptr1_i32_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB3_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB3_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: ptr1_i32_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB3_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB3_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
@@ -295,34 +295,32 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
;
; GFX90a-SDAG-LABEL: i32_ptr1_i32_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB4_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB4_0:
-; GFX90a-SDAG-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-SDAG-NEXT: s_add_i32 s0, s8, s12
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i32_ptr1_i32_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB4_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB4_0:
-; GFX90a-GISEL-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-GISEL-NEXT: s_add_i32 s0, s8, s12
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[10:11]
; GFX90a-GISEL-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
@@ -365,35 +363,35 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-SDAG-LABEL: ptr1_i16_i16_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB5_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB5_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-SDAG-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-SDAG-NEXT: s_add_i32 s0, s1, s0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: ptr1_i16_i16_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB5_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB5_0:
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s8, s8, 16
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v0, s10, s10, 16
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-GISEL-NEXT: v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
@@ -433,30 +431,30 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
;
; GFX90a-SDAG-LABEL: ptr1_v2i8_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB6_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB6_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: ptr1_v2i8_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB6_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB6_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
@@ -485,7 +483,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
;
; GFX90a-LABEL: byref_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
@@ -496,9 +494,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -531,7 +529,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: byref_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -542,9 +540,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -600,45 +598,45 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
;
; GFX90a-SDAG-LABEL: v8i32_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB9_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB9_0:
-; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-SDAG-NEXT: s_nop 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-SDAG-NEXT: s_nop 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v8i32_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB9_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB9_0:
-; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[14:15], s[14:15] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
-; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[18:19], s[18:19] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
; GFX90a-GISEL-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -682,37 +680,35 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-SDAG-LABEL: v3i16_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB10_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB10_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v3i16_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB10_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB10_0:
-; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:2
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:4
; GFX90a-GISEL-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
@@ -756,42 +752,20 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX90a-SDAG-LABEL: v3i32_preload_arg:
-; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-SDAG-NEXT: s_branch .LBB11_0
-; GFX90a-SDAG-NEXT: .p2align 8
-; GFX90a-SDAG-NEXT: ; %bb.2:
-; GFX90a-SDAG-NEXT: .LBB11_0:
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-SDAG-NEXT: s_endpgm
-;
-; GFX90a-GISEL-LABEL: v3i32_preload_arg:
-; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-GISEL-NEXT: s_branch .LBB11_0
-; GFX90a-GISEL-NEXT: .p2align 8
-; GFX90a-GISEL-NEXT: ; %bb.2:
-; GFX90a-GISEL-NEXT: .LBB11_0:
-; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
-; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
-; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-GISEL-NEXT: s_endpgm
+; GFX90a-LABEL: v3i32_preload_arg:
+; GFX90a: ; %bb.1:
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_branch .LBB11_0
+; GFX90a-NEXT: .p2align 8
+; GFX90a-NEXT: ; %bb.2:
+; GFX90a-NEXT: .LBB11_0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -836,39 +810,32 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-SDAG-LABEL: v3f32_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB12_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB12_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v3f32_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB12_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB12_0:
-; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
-; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
-; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
@@ -926,51 +893,49 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
;
; GFX90a-SDAG-LABEL: v5i8_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB13_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB13_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-SDAG-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-SDAG-NEXT: s_or_b32 s0, s0, s1
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_byte v0, v1, s[8:9] offset:4
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v5i8_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB13_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB13_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s1, 0xffff, s8
+; GFX90a-GISEL-NEXT: s_and_b32 s1, 0xffff, s10
; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s1, 8
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s0, 8
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:1
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:1
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:2
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:3
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:3
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:4
; GFX90a-GISEL-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
@@ -1029,51 +994,51 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
;
; GFX90a-SDAG-LABEL: v5f64_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB14_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB14_0:
; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-SDAG-NEXT: s_nop 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-SDAG-NEXT: s_nop 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v5f64_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB14_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB14_0:
-; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40
; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[14:15], s[14:15] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
-; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[18:19], s[18:19] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: global_store_dwordx2 v8, v[0:1], s[6:7] offset:32
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9] offset:32
; GFX90a-GISEL-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
@@ -1125,45 +1090,43 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
;
; GFX90a-SDAG-LABEL: v8i8_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB15_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB15_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s9, 24
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s11, 24
; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s9, 0x80010
+; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s11, 0x80010
; GFX90a-SDAG-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-SDAG-NEXT: s_lshr_b32 s2, s8, 24
+; GFX90a-SDAG-NEXT: s_lshr_b32 s2, s10, 24
; GFX90a-SDAG-NEXT: s_lshl_b32 s2, s2, 8
-; GFX90a-SDAG-NEXT: s_bfe_u32 s3, s8, 0x80010
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX90a-SDAG-NEXT: s_bfe_u32 s3, s10, 0x80010
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s11, 0xffff
; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-SDAG-NEXT: s_or_b32 s2, s3, s2
; GFX90a-SDAG-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-SDAG-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-SDAG-NEXT: s_lshl_b32 s2, s2, 16
; GFX90a-SDAG-NEXT: s_or_b32 s1, s1, s2
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v8i8_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB15_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB15_0:
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
@@ -1200,30 +1163,28 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
;
; GFX90a-SDAG-LABEL: i64_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB16_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB16_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i64_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB16_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB16_0:
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
@@ -1260,30 +1221,28 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
;
; GFX90a-SDAG-LABEL: f64_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB17_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB17_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-SDAG-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: f64_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB17_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB17_0:
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
@@ -1320,30 +1279,30 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-SDAG-LABEL: half_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB18_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB18_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: half_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB18_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB18_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
@@ -1380,30 +1339,30 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-SDAG-LABEL: bfloat_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB19_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB19_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: bfloat_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB19_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB19_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
@@ -1440,30 +1399,30 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-SDAG-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB20_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB20_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB20_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB20_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -1507,37 +1466,35 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-SDAG-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB21_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB21_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB21_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB21_0:
-; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:2
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:4
; GFX90a-GISEL-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -1581,42 +1538,20 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX942-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX90a-SDAG-LABEL: v6bfloat_kernel_preload_arg:
-; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-SDAG-NEXT: s_branch .LBB22_0
-; GFX90a-SDAG-NEXT: .p2align 8
-; GFX90a-SDAG-NEXT: ; %bb.2:
-; GFX90a-SDAG-NEXT: .LBB22_0:
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-SDAG-NEXT: s_endpgm
-;
-; GFX90a-GISEL-LABEL: v6bfloat_kernel_preload_arg:
-; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-GISEL-NEXT: s_branch .LBB22_0
-; GFX90a-GISEL-NEXT: .p2align 8
-; GFX90a-GISEL-NEXT: ; %bb.2:
-; GFX90a-GISEL-NEXT: .LBB22_0:
-; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
-; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
-; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-GISEL-NEXT: s_endpgm
+; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a: ; %bb.1:
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_branch .LBB22_0
+; GFX90a-NEXT: .p2align 8
+; GFX90a-NEXT: ; %bb.2:
+; GFX90a-NEXT: .LBB22_0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
}
@@ -1675,57 +1610,57 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
;
; GFX90a-SDAG-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB23_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB23_0:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[6:7]
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[0:1] offset:12
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[8:9]
+; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB23_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB23_0:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s10, 16
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX90a-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX90a-GISEL-NEXT: s_lshr_b32 s5, s1, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; GFX90a-GISEL-NEXT: s_lshr_b32 s11, s2, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:6
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-GISEL-NEXT: s_lshr_b32 s3, s11, 16
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:2
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:8
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:4
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; GFX90a-GISEL-NEXT: s_lshr_b32 s9, s12, 16
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:6
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:8
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:10
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[0:1] offset:12
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:12
; GFX90a-GISEL-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
@@ -1765,32 +1700,32 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
;
; GFX90a-SDAG-LABEL: i1_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB24_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB24_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 1
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_byte v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i1_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB24_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB24_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s10, 1
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
@@ -1837,40 +1772,32 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-SDAG-LABEL: fp128_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB25_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB25_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: fp128_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB25_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB25_0:
-; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
-; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
-; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
-; GFX90a-GISEL-NEXT: s_mov_b32 s3, s13
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90a-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[14:15], s[14:15] op_sel:[0,1]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
@@ -1936,59 +1863,57 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-SDAG-LABEL: v7i8_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB26_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB26_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-SDAG-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-SDAG-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-SDAG-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-SDAG-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-SDAG-NEXT: s_or_b32 s0, s0, s1
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9] offset:4
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v7i8_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB26_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB26_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s2, 0xffff, s8
+; GFX90a-GISEL-NEXT: s_and_b32 s2, 0xffff, s10
; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s2, 8
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX90a-GISEL-NEXT: s_lshr_b32 s3, s0, 8
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:1
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:1
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: s_and_b32 s4, 0xffff, s9
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:2
+; GFX90a-GISEL-NEXT: s_and_b32 s4, 0xffff, s11
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:2
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX90a-GISEL-NEXT: s_lshr_b32 s4, s4, 8
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:3
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
-; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s9, 16
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:3
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s11, 16
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:4
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:5
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:5
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[6:7] offset:6
+; GFX90a-GISEL-NEXT: global_store_byte v1, v0, s[8:9] offset:6
; GFX90a-GISEL-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
@@ -2046,51 +1971,47 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-SDAG-LABEL: v7half_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB27_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB27_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s15
+; GFX90a-SDAG-NEXT: global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: v7half_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB27_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB27_0:
-; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s10, 16
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-GISEL-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s11, 16
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:2
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX90a-GISEL-NEXT: s_lshr_b32 s1, s13, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:2
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:4
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s12, 16
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:6
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:8
+; GFX90a-GISEL-NEXT: s_lshr_b32 s2, s14, 16
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:6
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:8
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:10
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7] offset:12
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:10
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s15
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9] offset:12
; GFX90a-GISEL-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
@@ -2131,34 +2052,34 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-SDAG-LABEL: i16_i32_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB28_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB28_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[12:13]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i16_i32_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB28_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB28_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[6:7]
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s9
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[10:11]
+; GFX90a-GISEL-NEXT: global_store_short v1, v0, s[8:9]
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s11
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[12:13]
; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
@@ -2207,45 +2128,42 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
;
; GFX90a-SDAG-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB29_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB29_0:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-SDAG-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v4, s10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-SDAG-NEXT: global_store_short v3, v4, s[8:9]
+; GFX90a-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB29_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB29_0:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
-; GFX90a-GISEL-NEXT: s_mov_b32 s0, s10
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-GISEL-NEXT: s_mov_b32 s1, s11
-; GFX90a-GISEL-NEXT: s_mov_b32 s2, s12
-; GFX90a-GISEL-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_short v3, v0, s[8:9]
+; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
@@ -2286,33 +2204,33 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-SDAG-LABEL: i16_i16_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB30_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB30_0:
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-SDAG-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-SDAG-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i16_i16_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB30_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB30_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s8, s8, 16
-; GFX90a-GISEL-NEXT: global_store_short v2, v0, s[6:7]
-; GFX90a-GISEL-NEXT: global_store_short v2, v1, s[10:11]
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s10, s10, 16
+; GFX90a-GISEL-NEXT: global_store_short v2, v0, s[8:9]
+; GFX90a-GISEL-NEXT: global_store_short v2, v1, s[12:13]
; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
@@ -2358,38 +2276,38 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-SDAG-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB31_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB31_0:
-; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-SDAG-NEXT: s_lshr_b32 s0, s10, 24
; GFX90a-SDAG-NEXT: s_lshl_b32 s0, s0, 8
-; GFX90a-SDAG-NEXT: s_bfe_u32 s1, s8, 0x80010
+; GFX90a-SDAG-NEXT: s_bfe_u32 s1, s10, 0x80010
; GFX90a-SDAG-NEXT: s_or_b32 s0, s1, s0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[10:11]
+; GFX90a-SDAG-NEXT: global_store_short v0, v1, s[12:13]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB31_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB31_0:
-; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s8, s8, 16
-; GFX90a-GISEL-NEXT: global_store_short v2, v0, s[6:7]
-; GFX90a-GISEL-NEXT: global_store_short v2, v1, s[10:11]
+; GFX90a-GISEL-NEXT: v_alignbit_b32 v1, s10, s10, 16
+; GFX90a-GISEL-NEXT: global_store_short v2, v0, s[8:9]
+; GFX90a-GISEL-NEXT: global_store_short v2, v1, s[12:13]
; GFX90a-GISEL-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
@@ -2435,7 +2353,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
;
; GFX90a-SDAG-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX90a-SDAG-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB32_0
; GFX90a-SDAG-NEXT: .p2align 8
@@ -2445,14 +2363,14 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-SDAG-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-SDAG-NEXT: s_add_i32 s2, s8, s2
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX90a-GISEL-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB32_0
; GFX90a-GISEL-NEXT: .p2align 8
@@ -2462,7 +2380,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-GISEL-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-GISEL-NEXT: s_add_i32 s2, s8, s2
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90a-GISEL-NEXT: s_endpgm
@@ -2504,32 +2422,30 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
;
; GFX90a-SDAG-LABEL: ptr1_i8_trailing_unused:
; GFX90a-SDAG: ; %bb.1:
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-SDAG-NEXT: s_branch .LBB33_0
; GFX90a-SDAG-NEXT: .p2align 8
; GFX90a-SDAG-NEXT: ; %bb.2:
; GFX90a-SDAG-NEXT: .LBB33_0:
-; GFX90a-SDAG-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-SDAG-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-SDAG-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-SDAG-NEXT: s_endpgm
;
; GFX90a-GISEL-LABEL: ptr1_i8_trailing_unused:
; GFX90a-GISEL: ; %bb.1:
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-GISEL-NEXT: s_branch .LBB33_0
; GFX90a-GISEL-NEXT: .p2align 8
; GFX90a-GISEL-NEXT: ; %bb.2:
; GFX90a-GISEL-NEXT: .LBB33_0:
-; GFX90a-GISEL-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-GISEL-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX90a-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-GISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX90a-GISEL-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
More information about the llvm-commits mailing list