[llvm] AMDGPU/GlobalISel: Add regbanklegalize rules for load and store (PR #153176)
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 09:21:58 PDT 2025
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/153176
>From c9c95ddc5bbd184f11532f42c2acb3899fbb51ce Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Tue, 12 Aug 2025 15:24:37 +0200
Subject: [PATCH] AMDGPU/GlobalISel: Add regbanklegalize rules for load and
store
Cover all the missing cases and add very detailed tests for each rule.
In summary:
- Flat and Scratch loads, addrspace(0) and addrspace(5), are always
  divergent.
- Global and Constant, addrspace(1) and addrspace(4), have true uniform
  loads, s_load, but these require additional checks on alignment and on
  the MMO flags. For loads that are not naturally aligned or whose MMO is
  not uniform, do uniform-in-vgpr lowering.
- Local, addrspace(3), only has instructions for divergent loads; for
  uniform loads do uniform-in-vgpr lowering.
- Store rules are simplified using Ptr32. The operand to be stored needs
  to be a vgpr.
- Stores for GFX7 and older support the buffer_store patterns:
  - divergent addrspace(1) -> buffer_store addr64
  - uniform addrspace(1) -> buffer_store offset
  - addrspace(5) -> buffer_store offen
Some tests show a code-size regression since they now use more sgpr
instructions; these are marked with a FixMe comment to revisit later.
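As a rough illustration of the load rules above (a minimal sketch, not
taken from this patch's tests; function names are made up), an aligned,
non-volatile uniform constant load can stay on sgprs and select to
s_load, while a volatile uniform global load takes the uniform-in-vgpr
path:

define amdgpu_ps i32 @uniform_const_load(ptr addrspace(4) inreg %p) {
  ; naturally aligned, non-volatile, uniform -> s_load_dword
  %v = load i32, ptr addrspace(4) %p, align 4
  ret i32 %v
}

define amdgpu_ps i32 @uniform_volatile_load(ptr addrspace(1) inreg %p) {
  ; volatile MMO fails isUL -> global/buffer load into a vgpr, then
  ; v_readfirstlane back to an sgpr
  %v = load volatile i32, ptr addrspace(1) %p, align 4
  ret i32 %v
}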
---
.../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 42 +
.../AMDGPU/AMDGPURegBankLegalizeHelper.h | 2 +
.../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 222 +-
.../AMDGPU/AMDGPURegBankLegalizeRules.h | 2 +
.../AMDGPU/GlobalISel/atomic_load_flat.ll | 6 +-
.../AMDGPU/GlobalISel/atomic_load_global.ll | 8 +-
.../AMDGPU/GlobalISel/atomic_load_local.ll | 4 +-
.../AMDGPU/GlobalISel/atomic_load_local_2.ll | 6 +-
.../AMDGPU/GlobalISel/atomic_store_local.ll | 4 +-
.../AMDGPU/GlobalISel/bool-legalization.ll | 20 +-
...legalization-artifact-combiner-dead-def.ll | 2 +-
.../AMDGPU/GlobalISel/combiner-crash.ll | 2 +-
.../GlobalISel/crash-stack-address-O0.ll | 2 +-
.../GlobalISel/extractelement-stack-lower.ll | 4 +-
.../AMDGPU/GlobalISel/extractelement.i128.ll | 10 +-
.../AMDGPU/GlobalISel/extractelement.i16.ll | 10 +-
.../AMDGPU/GlobalISel/extractelement.i8.ll | 10 +-
.../GlobalISel/insertelement-stack-lower.ll | 2 +-
.../AMDGPU/GlobalISel/lds-global-value.ll | 2 +-
.../GlobalISel/llvm.amdgcn.dispatch.ptr.ll | 2 +-
.../llvm.amdgcn.kernarg.segment.ptr.ll | 6 +-
.../GlobalISel/llvm.amdgcn.queue.ptr.ll | 2 +-
.../GlobalISel/llvm.amdgcn.workgroup.id.ll | 8 +-
.../AMDGPU/GlobalISel/load-constant.96.ll | 303 ++-
.../AMDGPU/GlobalISel/load-constant32bit.ll | 6 +-
.../AMDGPU/GlobalISel/load-divergent.ll | 492 ++++
.../AMDGPU/GlobalISel/load-local.128.ll | 10 +-
.../AMDGPU/GlobalISel/load-local.96.ll | 10 +-
.../AMDGPU/GlobalISel/load-unaligned.ll | 80 +-
.../AMDGPU/GlobalISel/load-uniform-in-vgpr.ll | 2224 ++++++++++++++++-
.../CodeGen/AMDGPU/GlobalISel/load-uniform.ll | 602 +++++
.../load-zero-and-sign-extending-divergent.ll | 302 +++
...zero-and-sign-extending-uniform-in-vgpr.ll | 608 +++++
.../load-zero-and-sign-extending-uniform.ll | 231 ++
.../AMDGPU/GlobalISel/merge-buffer-stores.ll | 2 +-
.../GlobalISel/readanylane-combines.mir | 18 +-
.../AMDGPU/GlobalISel/regbankselect-load.mir | 120 +-
.../GlobalISel/regbankselect-sextload.mir | 21 +-
.../regbankselect-uniform-load-noclobber.mir | 162 +-
.../regbankselect-widen-scalar-loads.mir | 171 +-
.../GlobalISel/regbankselect-zextload.mir | 12 +-
.../AMDGPU/GlobalISel/regbankselect.mir | 22 +-
.../AMDGPU/GlobalISel/shufflevector.ll | 2 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll | 10 +-
.../AMDGPU/GlobalISel/store-divergent-addr.ll | 429 ++++
.../AMDGPU/GlobalISel/store-local.128.ll | 10 +-
.../AMDGPU/GlobalISel/store-local.96.ll | 10 +-
.../AMDGPU/GlobalISel/store-uniform-addr.ll | 105 +
.../AMDGPU/GlobalISel/unsupported-load.ll | 2 +-
.../GlobalISel/widen-i8-i16-scalar-loads.ll | 32 +-
.../CodeGen/AMDGPU/GlobalISel/zextload.ll | 10 +-
llvm/test/CodeGen/AMDGPU/ds-alignment.ll | 508 ++--
llvm/test/CodeGen/AMDGPU/lds-size.ll | 2 +-
.../llvm.amdgcn.addrspacecast.nonnull.ll | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll | 2 +-
.../AMDGPU/load-range-metadata-sign-bits.ll | 2 +-
llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 12 +-
.../CodeGen/AMDGPU/offset-split-global.ll | 12 +-
llvm/test/CodeGen/AMDGPU/read_register.ll | 2 +-
.../CodeGen/AMDGPU/scratch-pointer-sink.ll | 2 +-
llvm/test/CodeGen/AMDGPU/trap.ll | 18 +-
.../AMDGPU/workgroup-id-in-arch-sgprs.ll | 4 +-
62 files changed, 6164 insertions(+), 786 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/load-divergent.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-divergent.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/store-divergent-addr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/store-uniform-addr.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index b45627d9c1c5d..73b2660727342 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -352,6 +352,32 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
MI.eraseFromParent();
}
+void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
+ Register Dst = MI.getDstReg();
+ Register Ptr = MI.getPointerReg();
+ MachineMemOperand &MMO = MI.getMMO();
+ unsigned MemSize = 8 * MMO.getSize().getValue();
+
+ MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
+
+ if (MI.getOpcode() == G_LOAD) {
+ B.buildLoad(Dst, Ptr, *WideMMO);
+ } else {
+ auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
+
+ if (MI.getOpcode() == G_ZEXTLOAD) {
+ APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
+ auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
+ B.buildAnd(Dst, Load, MaskCst);
+ } else {
+ assert(MI.getOpcode() == G_SEXTLOAD);
+ B.buildSExtInReg(Dst, Load, MemSize);
+ }
+ }
+
+ MI.eraseFromParent();
+}
+
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
@@ -744,6 +770,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
break;
}
+ case WidenMMOToS32:
+ return widenMMOToS32(cast<GAnyLoad>(MI));
}
if (!WaterfallSgprs.empty()) {
@@ -759,6 +787,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(1);
case Sgpr16:
case Vgpr16:
+ case UniInVgprS16:
return LLT::scalar(16);
case Sgpr32:
case Sgpr32_WF:
@@ -895,6 +924,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB256:
case SgprB512:
case UniInVcc:
+ case UniInVgprS16:
case UniInVgprS32:
case UniInVgprV2S16:
case UniInVgprV4S32:
@@ -1015,6 +1045,18 @@ void RegBankLegalizeHelper::applyMappingDst(
B.buildTrunc(Reg, CopyS32_Vcc);
break;
}
+ case UniInVgprS16: {
+ assert(Ty == getTyFromID(MethodIDs[OpIdx]));
+ assert(RB == SgprRB);
+ Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
+ Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
+ Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
+ Op.setReg(NewVgprDstS16);
+ B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
+ buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
+ B.buildTrunc(Reg, NewSgprDstS32);
+ break;
+ }
case UniInVgprS32:
case UniInVgprV2S16:
case UniInVgprV4S32: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index db965d8c000d9..7affe5ab3da7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -11,6 +11,7 @@
#include "AMDGPURegBankLegalizeRules.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@@ -107,6 +108,7 @@ class RegBankLegalizeHelper {
void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
LLT MergeTy = LLT());
void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
+ void widenMMOToS32(GAnyLoad &MI) const;
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 8c56c21621121..0776d14a84067 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -467,6 +467,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
: ST(&_ST), MRI(&_MRI) {
addRulesForGOpcs({G_ADD, G_SUB}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
@@ -615,8 +617,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
.Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}});
- bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
+ bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
+ bool usesTrue16 = ST->useRealTrue16Insts();
Predicate isAlign16([](const MachineInstr &MI) -> bool {
return (*MI.memoperands_begin())->getAlign() >= Align(16);
@@ -654,54 +657,187 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
return (*MI.memoperands_begin())->getFlags() & MONoClobber;
});
- Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool {
+ Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return MMO->getAlign() >= Align(MMO->getSize().getValue());
+ });
+
+ Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
const MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned MemSize = 8 * MMO->getSize().getValue();
- return (MemSize == 16 && MMO->getAlign() >= Align(2)) ||
- (MemSize == 8 && MMO->getAlign() >= Align(1));
+ return MemSize == 16 || MemSize == 8;
+ });
+
+ Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return 8 * MMO->getSize().getValue() == 32;
});
auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
(isConst || isInvMMO || isNoClobberMMO);
// clang-format off
+ // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets
addRulesForGOpcs({G_LOAD})
- .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
- .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}})
-
- .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
- .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
- .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
- .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
- .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
- .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
- .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
- .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
- .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
-
- .Any({{DivB32, UniP3}, {{VgprB32}, {VgprP3}}})
- .Any({{{UniB32, UniP3}, isAlign4 && isUL}, {{SgprB32}, {SgprP3}}})
- .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
-
- .Any({{{DivB256, DivP4}}, {{VgprB256}, {VgprP4}, SplitLoad}})
- .Any({{{UniB32, UniP4}, isNaturalAlignedSmall && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // i8 and i16 load
- .Any({{{UniB32, UniP4}, isAlign4 && isUL}, {{SgprB32}, {SgprP4}}})
- .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
- .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
- .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
- .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
- .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
- .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
- .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
- .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}})
- .Any({{{UniB256, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP4}, SplitLoad}})
- .Any({{{UniB512, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP4}, SplitLoad}})
-
- .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}});
-
- addRulesForGOpcs({G_ZEXTLOAD}) // i8 and i16 zero-extending loads
- .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
- .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
+ // flat, addrspace(0), never uniform - flat_load
+ .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
+ .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
+ .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
+ .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})
+
+ // global, addrspace(1)
+ // divergent - global_load
+ .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
+ .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) //32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
+ .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
+ .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
+ .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
+ .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})
+
+ // uniform - s_load
+ .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
+ .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
+ // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
+ .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
+ .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) //32-bit load
+ .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
+ .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
+ .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
+ .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
+ .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
+
+ // Uniform via global or buffer load, for example volatile or non-aligned
+ // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
+ // selected as global_load, use SgprP1 for pointer instead to match
+ // patterns without flat-for-global, default for GFX7 and older.
+ // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
+ // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
+ .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load
+ .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
+ .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
+ .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
+ .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
+ .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})
+
+ // local, addrspace(3) - ds_load
+ .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
+ .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
+ .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
+ .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})
+
+ .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16-bit load
+ .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
+ .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
+ .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})
+
+ // constant, addrspace(4)
+ // divergent - global_load
+ .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
+ .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) //32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
+ .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
+ .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
+ .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
+ .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})
+
+ // uniform - s_load
+ .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
+ .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
+ .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) //32-bit load
+ .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
+ .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
+ .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
+ .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
+ .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
+
+ // uniform in vgpr - global_load or buffer_load
+ .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load
+ .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
+ .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
+ .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
+ .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
+ .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})
+
+ // private, addrspace(5), never uniform - scratch_load
+ .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
+ .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load
+ .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
+ .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
+ .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
+
+ .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
+
+
+ addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zero- and sign-extending loads
+ .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
+
+ .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
+ .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
+ .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
+ .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
+ .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
+
+ .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
+ .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
+
+ .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
+ .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
+ .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
+ .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
+ .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
+
+ .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}});
+
+ addRulesForGOpcs({G_STORE})
+ // addrspace(0)
+ .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store
+ .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store
+ .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}})
+ .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}})
+ .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}})
+
+ // addrspace(1), there are no stores to addrspace(4)
+ // For targets:
+ // - with "+flat-for-global" - global_store
+ // - without(-flat-for-global) - buffer_store addr64
+ .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store
+ .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
+ .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}})
+ .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}})
+ .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}})
+
+ // For UniP1, use sgpr ptr to match flat-for-global patterns. Targets:
+ // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr
+ // - without(-flat-for-global) - need sgpr ptr to select buffer_store
+ .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store
+ .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
+ .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}})
+ .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}})
+ .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}})
+
+ // addrspace(3) and addrspace(5)
+ .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store
+ .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store
+ .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
+ .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
+ .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
// clang-format on
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB)
@@ -714,12 +850,6 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
.Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
- addRulesForGOpcs({G_STORE})
- .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
- .Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
- .Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
- .Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});
-
addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 13914403c439e..d0c69105356b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -176,6 +176,7 @@ enum RegBankLLTMappingApplyID {
// Dst only modifiers: read-any-lane and truncs
UniInVcc,
+ UniInVgprS16,
UniInVgprS32,
UniInVgprV2S16,
UniInVgprV4S32,
@@ -221,6 +222,7 @@ enum LoweringMethodID {
UniCstExt,
SplitLoad,
WidenLoad,
+ WidenMMOToS32
};
enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
index 83912b1e77db2..97694f3304431 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
index e2906c3d4fdb2..5d902d5ec98ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
define i8 @atomic_load_global_monotonic_i8(ptr addrspace(1) %ptr) {
; GFX6-LABEL: atomic_load_global_monotonic_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll
index 70cd96338a0c9..c1dbf91aa9086 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}atomic_load_monotonic_i8:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
index 1656814d6fb06..31cdbbe1c4d73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; TODO: Merge with atomic_load_local.ll
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll
index dea42d62ec2d4..76850f0c0db17 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}atomic_store_monotonic_i8:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index 18895f7867369..358ecd8fce3a9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
; End to end tests for scalar vs. vector boolean legalization strategies.
define amdgpu_ps float @select_vgpr_sgpr_trunc_cond(i32 inreg %a, i32 %b, i32 %c) {
; WAVE64-LABEL: select_vgpr_sgpr_trunc_cond:
; WAVE64: ; %bb.0:
-; WAVE64-NEXT: s_and_b32 s0, 1, s0
-; WAVE64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
+; WAVE64-NEXT: s_cselect_b64 vcc, exec, 0
; WAVE64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; WAVE64-NEXT: ; return to shader part epilog
;
; WAVE32-LABEL: select_vgpr_sgpr_trunc_cond:
; WAVE32: ; %bb.0:
-; WAVE32-NEXT: s_and_b32 s0, 1, s0
-; WAVE32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
+; WAVE32-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0
; WAVE32-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; WAVE32-NEXT: ; return to shader part epilog
%cc = trunc i32 %a to i1
@@ -28,16 +28,16 @@ define amdgpu_ps float @select_vgpr_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inre
; WAVE64-LABEL: select_vgpr_sgpr_trunc_and_cond:
; WAVE64: ; %bb.0:
; WAVE64-NEXT: s_and_b32 s0, s0, s1
-; WAVE64-NEXT: s_and_b32 s0, 1, s0
-; WAVE64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
+; WAVE64-NEXT: s_cselect_b64 vcc, exec, 0
; WAVE64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; WAVE64-NEXT: ; return to shader part epilog
;
; WAVE32-LABEL: select_vgpr_sgpr_trunc_and_cond:
; WAVE32: ; %bb.0:
; WAVE32-NEXT: s_and_b32 s0, s0, s1
-; WAVE32-NEXT: s_and_b32 s0, 1, s0
-; WAVE32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
+; WAVE32-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0
; WAVE32-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; WAVE32-NEXT: ; return to shader part epilog
%cc.0 = trunc i32 %a.0 to i1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll
index d317a3ef54162..a79e471b1b5bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define void @value_finder_bug(ptr addrspace(5) %store_ptr, ptr addrspace(4) %ptr) {
; GFX10-LABEL: value_finder_bug:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll
index 356ef52bf21b6..e1ae61be5a66b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s
+; RUN: llc -O0 -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s
define amdgpu_kernel void @test_long_add4() {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
index 8efe711c9267c..b153ff06b727e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s | FileCheck %s
; Make sure there's no crash at -O0 when matching MUBUF addressing
; modes for the stack.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 4fc0488ec60cf..990e4f67e420d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
; Check lowering of some large extractelement that use the stack
; instead of register indexing.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 3605daef17bd3..405861d791169 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index e4acee9ddda7e..798f6eb65e6aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index ac17dde1f9aa7..de1079196223a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 3e1602625f197..c2129c20e4543 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
; Check lowering of some large insertelement that use the stack
; instead of register indexing.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index e4135fae40006..7fd981c3f3fc6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
; TODO: Replace with existing DAG tests
@lds_512_4 = internal unnamed_addr addrspace(3) global [128 x i32] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
index 79760ce4a2a0b..5f529f5a3caaf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
; FIXME: Error on non-HSA target
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
index ee9cf0b84868f..d37ade73daf5d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,6 +1,6 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s
; ALL-LABEL: {{^}}test:
; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
index 0467547e55374..eecd9ae13912e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
; FIXME: Error on non-hsa target
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll
index b2546700a935d..f491df8448a7a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll
@@ -1,8 +1,8 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o %t.bc
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s
declare i32 @llvm.amdgcn.workgroup.id.x() #0
declare i32 @llvm.amdgcn.workgroup.id.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index ab8d8c192187f..41fda6de82181 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -1000,32 +1000,50 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:7
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:11
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:11
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 8, v1
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 24, v4
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v4
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v5
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v6, 8, v5
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v6
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s6, v7
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v8
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v3
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s7, v8
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s8, v9
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v9
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s9, v10
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v11
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s10, v11
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v6
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s11, v12
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 24
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 8
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s7, s7, 24
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s6, s6, 16
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s9, s9, 8
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s11, s11, 24
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s10, s10, 16
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s3, s7, s6
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s4, s9, s8
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s5, s11, s10
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
@@ -1043,102 +1061,120 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX7-NOUNALIGNED: ; %bb.0:
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:3
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:5
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:7
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:6
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:9
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:10
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:4
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:8
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s6, v6
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s7, v7
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s8, v8
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v9
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s9, v9
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v3, v10
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s10, v10
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v6, v11
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v5, v7, v8
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v2
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v4
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s11, v11
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 8
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 24
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s2, s2, 16
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 8
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s7, s7, 24
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s6, s6, 16
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s9, s9, 8
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s11, s11, 24
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s10, s10, 16
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s3, s7, s6
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s4, s9, s8
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s5, s11, s10
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX6-LABEL: s_load_constant_v3i32_align1:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1
-; GFX6-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:3
+; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; GFX6-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1
; GFX6-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2
-; GFX6-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:5
-; GFX6-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:7
-; GFX6-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:6
-; GFX6-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:9
-; GFX6-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11
-; GFX6-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:10
-; GFX6-NEXT: buffer_load_ubyte v9, off, s[0:3], 0
-; GFX6-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:4
-; GFX6-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:8
+; GFX6-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3
+; GFX6-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4
+; GFX6-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5
+; GFX6-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6
+; GFX6-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7
+; GFX6-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8
+; GFX6-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9
+; GFX6-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10
+; GFX6-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11
; GFX6-NEXT: s_waitcnt vmcnt(11)
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: s_waitcnt vmcnt(10)
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
; GFX6-NEXT: s_waitcnt vmcnt(9)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_readfirstlane_b32 s2, v2
; GFX6-NEXT: s_waitcnt vmcnt(8)
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT: v_readfirstlane_b32 s3, v3
; GFX6-NEXT: s_waitcnt vmcnt(7)
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX6-NEXT: v_readfirstlane_b32 s4, v4
; GFX6-NEXT: s_waitcnt vmcnt(6)
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_readfirstlane_b32 s5, v5
; GFX6-NEXT: s_waitcnt vmcnt(5)
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-NEXT: v_readfirstlane_b32 s6, v6
; GFX6-NEXT: s_waitcnt vmcnt(4)
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX6-NEXT: v_readfirstlane_b32 s7, v7
; GFX6-NEXT: s_waitcnt vmcnt(3)
-; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX6-NEXT: v_readfirstlane_b32 s8, v8
; GFX6-NEXT: s_waitcnt vmcnt(2)
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v9
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v9
; GFX6-NEXT: s_waitcnt vmcnt(1)
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v10
-; GFX6-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX6-NEXT: v_readfirstlane_b32 s10, v10
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v11
-; GFX6-NEXT: v_or_b32_e32 v5, v7, v8
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v11
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
+; GFX6-NEXT: s_lshl_b32 s3, s3, 24
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_lshl_b32 s5, s5, 8
+; GFX6-NEXT: s_lshl_b32 s7, s7, 24
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_lshl_b32 s9, s9, 8
+; GFX6-NEXT: s_lshl_b32 s11, s11, 24
+; GFX6-NEXT: s_lshl_b32 s10, s10, 16
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: s_or_b32 s1, s3, s2
+; GFX6-NEXT: s_or_b32 s2, s5, s4
+; GFX6-NEXT: s_or_b32 s3, s7, s6
+; GFX6-NEXT: s_or_b32 s4, s9, s8
+; GFX6-NEXT: s_or_b32 s5, s11, s10
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: s_or_b32 s1, s3, s2
+; GFX6-NEXT: s_or_b32 s2, s5, s4
; GFX6-NEXT: ; return to shader part epilog
%load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
ret <3 x i32> %load
@@ -1220,15 +1256,24 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] offset:10
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v4
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v5
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v6
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 16
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 16
+; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 16
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
+; GFX9-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
@@ -1246,54 +1291,60 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
; GFX7-NOUNALIGNED: ; %bb.0:
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:6
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:10
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v5
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 16
+; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 16
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
+; GFX7-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX6-LABEL: s_load_constant_v3i32_align2:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
-; GFX6-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:6
-; GFX6-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:10
-; GFX6-NEXT: buffer_load_ushort v3, off, s[0:3], 0
-; GFX6-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4
-; GFX6-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8
+; GFX6-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; GFX6-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2
+; GFX6-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4
+; GFX6-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
+; GFX6-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8
+; GFX6-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10
; GFX6-NEXT: s_waitcnt vmcnt(5)
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: s_waitcnt vmcnt(4)
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
; GFX6-NEXT: s_waitcnt vmcnt(3)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_readfirstlane_b32 s2, v2
; GFX6-NEXT: s_waitcnt vmcnt(2)
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_readfirstlane_b32 s3, v3
; GFX6-NEXT: s_waitcnt vmcnt(1)
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT: v_readfirstlane_b32 s4, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
+; GFX6-NEXT: v_readfirstlane_b32 s5, v5
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_lshl_b32 s5, s5, 16
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: s_or_b32 s1, s3, s2
+; GFX6-NEXT: s_or_b32 s2, s5, s4
; GFX6-NEXT: ; return to shader part epilog
%load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
index 44e4320cddb22..0038a097174c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; FIXME: Test should be redundant with constant-address-space-32bit.ll
; It's important to check with gfx8 and gfx9 to check access through global and flat.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-divergent.ll
new file mode 100644
index 0000000000000..197133441d3a5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-divergent.ll
@@ -0,0 +1,492 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode,-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode,+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s
+
+define amdgpu_ps void @load_divergent_P0_i8_any_extending(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_divergent_P0_i8_any_extending:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[2:3], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ store i8 %a, ptr addrspace(0) %out
+ ret void
+}
+
+; with true16, S16 16-bit load
+; without true16, S32 16-bit any-extending load
+define amdgpu_ps void @load_divergent_P0_i16(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-True16-LABEL: load_divergent_P0_i16:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: flat_load_u16 v0, v[0:1]
+; GFX12-True16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-True16-NEXT: flat_store_b16 v[2:3], v0
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_divergent_P0_i16:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: flat_load_d16_b16 v0, v[0:1]
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NoTrue16-NEXT: flat_store_b16 v[2:3], v0
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(0) %ptra
+ store i16 %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P0_i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_divergent_P0_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_b32 v0, v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[2:3], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(0) %ptra
+ store i32 %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P0_v2i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_divergent_P0_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(0) %ptra
+ store <2 x i32> %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P0_v3i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_divergent_P0_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_b96 v[4:6], v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b96 v[2:3], v[4:6]
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(0) %ptra
+ store <3 x i32> %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P0_v4i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_divergent_P0_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7]
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(0) %ptra
+ store <4 x i32> %a, ptr addrspace(0) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @load_divergent_P1_i8_any_extending(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P1_i8_any_extending:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+; with true16, S16 16-bit load
+; without true16, S32 16-bit any-extending load
+define amdgpu_ps void @load_divergent_P1_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-True16-LABEL: load_divergent_P1_i16:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: global_load_u16 v0, v[0:1], off
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: global_store_b16 v[2:3], v0, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_divergent_P1_i16:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[2:3], v0, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P1_i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P1_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(1) %ptra
+ store i32 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P1_v2i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P1_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(1) %ptra
+ store <2 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P1_v3i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P1_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[4:6], v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b96 v[2:3], v[4:6], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(1) %ptra
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P1_v4i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P1_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(1) %ptra
+ store <4 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P1_v8i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P1_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_endpgm
+ %a = load <8 x i32>, ptr addrspace(1) %ptra
+ store <8 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P1_v16i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P1_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX12-NEXT: global_load_b128 v[12:15], v[0:1], off offset:32
+; GFX12-NEXT: global_load_b128 v[16:19], v[0:1], off offset:48
+; GFX12-NEXT: s_wait_loadcnt 0x3
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_wait_loadcnt 0x2
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[12:15], off offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[2:3], v[16:19], off offset:48
+; GFX12-NEXT: s_endpgm
+ %a = load <16 x i32>, ptr addrspace(1) %ptra
+ store <16 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @load_divergent_P3_i8_any_extending(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_divergent_P3_i8_any_extending:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u8 v0, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b8 v1, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ store i8 %a, ptr addrspace(3) %out
+ ret void
+}
+
+; with true16, S16 16-bit load
+; without true16, S32 16-bit any-extending load
+define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-True16-LABEL: load_divergent_P3_i16:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: ds_load_u16 v0, v0
+; GFX12-True16-NEXT: s_wait_dscnt 0x0
+; GFX12-True16-NEXT: ds_store_b16 v1, v0
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_divergent_P3_i16:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: ds_load_u16_d16 v0, v0
+; GFX12-NoTrue16-NEXT: s_wait_dscnt 0x0
+; GFX12-NoTrue16-NEXT: ds_store_b16 v1, v0
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ store i16 %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_divergent_P3_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_b32 v0, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v1, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(3) %ptra
+ store i32 %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_v2i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_divergent_P3_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_b64 v[2:3], v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b64 v1, v[2:3]
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(3) %ptra
+ store <2 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_v3i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_divergent_P3_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_b96 v[2:4], v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b96 v1, v[2:4]
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(3) %ptra
+ store <3 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_v4i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_divergent_P3_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_b128 v[2:5], v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b128 v1, v[2:5]
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(3) %ptra
+ store <4 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @load_divergent_P4_i8_any_extending(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P4_i8_any_extending:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+; with true16, S16 16-bit load
+; without true16, S32 16-bit any-extending load
+define amdgpu_ps void @load_divergent_P4_i16(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-True16-LABEL: load_divergent_P4_i16:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: global_load_u16 v0, v[0:1], off
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: global_store_b16 v[2:3], v0, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_divergent_P4_i16:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[2:3], v0, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P4_i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P4_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(4) %ptra
+ store i32 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P4_v2i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P4_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(4) %ptra
+ store <2 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P4_v3i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P4_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b96 v[4:6], v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b96 v[2:3], v[4:6], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(4) %ptra
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P4_v4i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P4_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(4) %ptra
+ store <4 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P4_v8i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P4_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_endpgm
+ %a = load <8 x i32>, ptr addrspace(4) %ptra
+ store <8 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P4_v16i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_divergent_P4_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX12-NEXT: global_load_b128 v[12:15], v[0:1], off offset:32
+; GFX12-NEXT: global_load_b128 v[16:19], v[0:1], off offset:48
+; GFX12-NEXT: s_wait_loadcnt 0x3
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_wait_loadcnt 0x2
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[12:15], off offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[2:3], v[16:19], off offset:48
+; GFX12-NEXT: s_endpgm
+ %a = load <16 x i32>, ptr addrspace(4) %ptra
+ store <16 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @load_divergent_P5_i8_any_extending(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: load_divergent_P5_i8_any_extending:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_u8 v0, v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b8 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(5) %ptra
+ store i8 %a, ptr addrspace(5) %out
+ ret void
+}
+
+; with true16, S16 16-bit load
+; without true16, S32 16-bit any-extending load
+define amdgpu_ps void @load_divergent_P5_i16(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-True16-LABEL: load_divergent_P5_i16:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: scratch_load_u16 v0, v0, off
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: scratch_store_b16 v1, v0, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_divergent_P5_i16:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: scratch_load_d16_b16 v0, v0, off
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: scratch_store_b16 v1, v0, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(5) %ptra
+ store i16 %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P5_i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: load_divergent_P5_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_b32 v0, v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(5) %ptra
+ store i32 %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P5_v2i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: load_divergent_P5_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_b64 v[2:3], v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b64 v1, v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(5) %ptra
+ store <2 x i32> %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P5_v3i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: load_divergent_P5_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_b96 v[2:4], v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b96 v1, v[2:4], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(5) %ptra
+ store <3 x i32> %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P5_v4i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: load_divergent_P5_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_b128 v[2:5], v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b128 v1, v[2:5], off
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(5) %ptra
+ store <4 x i32> %a, ptr addrspace(5) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index caaface376f21..d7fcbd5d623c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index cbfdfd3286884..191f2e0670e15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v3i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index ed248b450582c..b1de0eff05d30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s
+
+; FixMe: need merge/unmerge artifact combine
; Unaligned DS access in available from GFX9 onwards.
; LDS alignment enforcement is controlled by a configuration register:
@@ -283,8 +285,24 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg
; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_readfirstlane_b32 s7, v3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s8, v4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_readfirstlane_b32 s9, v5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v6
+; GFX9-NEXT: v_readfirstlane_b32 s11, v7
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-NEXT: v_mov_b32_e32 v7, s11
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX9-NEXT: s_endpgm
;
@@ -298,8 +316,24 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s8, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v5
+; GFX7-NEXT: v_readfirstlane_b32 s10, v6
+; GFX7-NEXT: v_readfirstlane_b32 s11, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: v_mov_b32_e32 v5, s9
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: v_mov_b32_e32 v7, s11
; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX7-NEXT: s_endpgm
;
@@ -310,8 +344,24 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg
; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
+; GFX10-NEXT: v_readfirstlane_b32 s7, v3
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s8, v4
+; GFX10-NEXT: v_readfirstlane_b32 s9, v5
+; GFX10-NEXT: v_readfirstlane_b32 s10, v6
+; GFX10-NEXT: v_readfirstlane_b32 s11, v7
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
+; GFX10-NEXT: v_mov_b32_e32 v4, s8
+; GFX10-NEXT: v_mov_b32_e32 v5, s9
+; GFX10-NEXT: v_mov_b32_e32 v6, s10
+; GFX10-NEXT: v_mov_b32_e32 v7, s11
+; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX10-NEXT: s_endpgm
;
@@ -322,8 +372,22 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg
; GFX11-NEXT: global_load_b128 v[0:3], v8, s[0:1]
; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3]
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s8, v4
+; GFX11-NEXT: v_readfirstlane_b32 s9, v5
+; GFX11-NEXT: v_readfirstlane_b32 s10, v6
+; GFX11-NEXT: v_readfirstlane_b32 s11, v7
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v6, s10
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3]
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16
; GFX11-NEXT: s_endpgm
%load = load <8 x i32>, ptr addrspace(4) %ptr, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
index 92e532b6cf340..4361e5c113708 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
@@ -1,95 +1,2135 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck %s
-
-define amdgpu_ps void @uniform_load_i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1, ptr addrspace(1) inreg %ptr2) {
-; CHECK-LABEL: uniform_load_i32:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_load_dword v2, v0, s[2:3]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s1, v2
-; CHECK-NEXT: s_add_i32 s0, s0, s1
-; CHECK-NEXT: v_mov_b32_e32 v1, s0
-; CHECK-NEXT: global_store_dword v0, v1, s[4:5]
-; CHECK-NEXT: s_endpgm
- %load0 = load volatile i32, ptr addrspace(1) %ptr0
- %load1 = load i32, ptr addrspace(1) %ptr1, align 1
- %sum = add i32 %load0, %load1
- store i32 %sum, ptr addrspace(1) %ptr2
- ret void
-}
-
-define amdgpu_ps void @uniform_load_v2i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
-; CHECK-LABEL: uniform_load_v2i32:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: s_add_i32 s0, s0, s1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
-; CHECK-NEXT: s_endpgm
- %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
- %elt0 = extractelement <2 x i32> %load, i32 0
- %elt1 = extractelement <2 x i32> %load, i32 1
- %sum = add i32 %elt0, %elt1
- store i32 %sum, ptr addrspace(1) %ptr1
- ret void
-}
-
-define amdgpu_ps void @uniform_load_v3i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
-; CHECK-LABEL: uniform_load_v3i32:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-NEXT: s_add_i32 s0, s0, s1
-; CHECK-NEXT: s_add_i32 s0, s0, s4
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: global_store_dword v3, v0, s[2:3]
-; CHECK-NEXT: s_endpgm
- %load = load <3 x i32>, ptr addrspace(1) %ptr0, align 2
- %elt0 = extractelement <3 x i32> %load, i32 0
- %elt1 = extractelement <3 x i32> %load, i32 1
- %elt2 = extractelement <3 x i32> %load, i32 2
- %sum0 = add i32 %elt0, %elt1
- %sum = add i32 %sum0, %elt2
- store i32 %sum, ptr addrspace(1) %ptr1
- ret void
-}
-
-define amdgpu_ps void @uniform_load_v4i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
-; CHECK-LABEL: uniform_load_v4i32:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] glc dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-NEXT: s_add_i32 s0, s0, s1
-; CHECK-NEXT: s_add_i32 s0, s0, s4
-; CHECK-NEXT: s_add_i32 s0, s0, s5
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: global_store_dword v4, v0, s[2:3]
-; CHECK-NEXT: s_endpgm
- %load = load volatile <4 x i32>, ptr addrspace(1) %ptr0
- %elt0 = extractelement <4 x i32> %load, i32 0
- %elt1 = extractelement <4 x i32> %load, i32 1
- %elt2 = extractelement <4 x i32> %load, i32 2
- %elt3 = extractelement <4 x i32> %load, i32 3
- %sum0 = add i32 %elt0, %elt1
- %sum1 = add i32 %sum0, %elt2
- %sum = add i32 %sum1, %elt3
- store i32 %sum, ptr addrspace(1) %ptr1
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11,GFX11-True16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-NoTrue16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s
+
+; global address space, addrspace(1)
+
+; gfx12 with true16, not natural alignment or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_i16_b16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_i16_b16_gfx12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s4, s2
+; GFX7-NEXT: s_mov_b32 s5, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx12:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: s_clause 0x1
+; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: s_clause 0x1
+; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx12:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-True16-NEXT: s_clause 0x1
+; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NoTrue16-NEXT: s_clause 0x1
+; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra, align 1
+ %b = load volatile i16, ptr addrspace(1) %ptrb
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11 true16, 16-bit load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_i16_b16_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_i16_b16_gfx11:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx11:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: s_clause 0x1
+; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: s_clause 0x1
+; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx11:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %b = load volatile i16, ptr addrspace(1) %ptra, align 4
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx12 without true16, 16-bit any-extending load, not natural alignment or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: s_clause 0x1
+; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: s_clause 0x1
+; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-True16-NEXT: s_clause 0x1
+; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NoTrue16-NEXT: s_clause 0x1
+; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra, align 1
+ %b = load volatile i16, ptr addrspace(1) %ptra
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11 (or older) without true16, S16 any-extending load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx11:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: s_clause 0x1
+; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: s_clause 0x1
+; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %b = load volatile i16, ptr addrspace(1) %ptra, align 4
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 32-bit load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P1_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_b32 v2, v2, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b32 v3, v2, s[0:1]
+; GFX12-NEXT: global_load_b32 v2, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(1) %ptra, align 2
+ %b = load volatile i32, ptr addrspace(1) %ptra
+ %sum = add i32 %a, %b
+ store i32 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 64-bit load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: v_readfirstlane_b32 s5, v5
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7-NEXT: s_add_i32 s1, s1, s5
+; GFX7-NEXT: s_add_i32 s0, s0, s4
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P1_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1]
+; GFX11-NEXT: global_load_b64 v[4:5], v4, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: v_readfirstlane_b32 s3, v5
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s2, v4
+; GFX11-NEXT: s_add_i32 s1, s1, s3
+; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b64 v[2:3], v4, s[0:1]
+; GFX12-NEXT: global_load_b64 v[4:5], v4, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s3, v5
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s2, v4
+; GFX12-NEXT: s_add_co_i32 s1, s1, s3
+; GFX12-NEXT: s_add_co_i32 s0, s0, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(1) %ptra, align 2
+ %b = load volatile <2 x i32>, ptr addrspace(1) %ptra
+ %sum = add <2 x i32> %a, %b
+ store <2 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 96-bit load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_v3i32_gfx12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dwordx3 v[5:7], off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s4, v5
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: v_readfirstlane_b32 s6, v4
+; GFX7-NEXT: v_readfirstlane_b32 s5, v6
+; GFX7-NEXT: v_readfirstlane_b32 s7, v7
+; GFX7-NEXT: s_add_i32 s4, s0, s4
+; GFX7-NEXT: s_add_i32 s5, s1, s5
+; GFX7-NEXT: s_add_i32 s6, s6, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P1_v3i32_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b96 v[2:4], v5, s[0:1]
+; GFX11-NEXT: global_load_b96 v[5:7], v5, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s2, v4
+; GFX11-NEXT: v_readfirstlane_b32 s5, v7
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: v_readfirstlane_b32 s3, v5
+; GFX11-NEXT: v_readfirstlane_b32 s4, v6
+; GFX11-NEXT: s_add_i32 s2, s2, s5
+; GFX11-NEXT: s_add_i32 s0, s0, s3
+; GFX11-NEXT: s_add_i32 s1, s1, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v3i32_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v5, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b96 v[2:4], v5, s[0:1]
+; GFX12-NEXT: global_load_b96 v[5:7], v5, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s2, v4
+; GFX12-NEXT: v_readfirstlane_b32 s5, v7
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s3, v5
+; GFX12-NEXT: v_readfirstlane_b32 s4, v6
+; GFX12-NEXT: s_add_co_i32 s2, s2, s5
+; GFX12-NEXT: s_add_co_i32 s0, s0, s3
+; GFX12-NEXT: s_add_co_i32 s1, s1, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(1) %ptra, align 2
+ %b = load volatile <3 x i32>, ptr addrspace(1) %ptra
+ %sum = add <3 x i32> %a, %b
+ store <3 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 128-bit load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s4, v6
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: v_readfirstlane_b32 s6, v4
+; GFX7-NEXT: v_readfirstlane_b32 s7, v5
+; GFX7-NEXT: v_readfirstlane_b32 s5, v7
+; GFX7-NEXT: v_readfirstlane_b32 s8, v8
+; GFX7-NEXT: v_readfirstlane_b32 s9, v9
+; GFX7-NEXT: s_add_i32 s4, s0, s4
+; GFX7-NEXT: s_add_i32 s5, s1, s5
+; GFX7-NEXT: s_add_i32 s6, s6, s8
+; GFX7-NEXT: s_add_i32 s7, s7, s9
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P1_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1]
+; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v5
+; GFX11-NEXT: v_readfirstlane_b32 s7, v9
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: v_readfirstlane_b32 s2, v4
+; GFX11-NEXT: v_readfirstlane_b32 s4, v6
+; GFX11-NEXT: v_readfirstlane_b32 s5, v7
+; GFX11-NEXT: v_readfirstlane_b32 s6, v8
+; GFX11-NEXT: s_add_i32 s3, s3, s7
+; GFX11-NEXT: s_add_i32 s0, s0, s4
+; GFX11-NEXT: s_add_i32 s1, s1, s5
+; GFX11-NEXT: s_add_i32 s2, s2, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v6, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1]
+; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s3, v5
+; GFX12-NEXT: v_readfirstlane_b32 s7, v9
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s2, v4
+; GFX12-NEXT: v_readfirstlane_b32 s4, v6
+; GFX12-NEXT: v_readfirstlane_b32 s5, v7
+; GFX12-NEXT: v_readfirstlane_b32 s6, v8
+; GFX12-NEXT: s_add_co_i32 s3, s3, s7
+; GFX12-NEXT: s_add_co_i32 s0, s0, s4
+; GFX12-NEXT: s_add_co_i32 s1, s1, s5
+; GFX12-NEXT: s_add_co_i32 s2, s2, s6
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(1) %ptra, align 2
+ %b = load volatile <4 x i32>, ptr addrspace(1) %ptra
+ %sum = add <4 x i32> %a, %b
+ store <4 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 256-bit load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_v8i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:16 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
+; GFX7-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7-NEXT: v_readfirstlane_b32 s12, v10
+; GFX7-NEXT: v_readfirstlane_b32 s6, v4
+; GFX7-NEXT: v_readfirstlane_b32 s7, v5
+; GFX7-NEXT: v_readfirstlane_b32 s8, v6
+; GFX7-NEXT: v_readfirstlane_b32 s13, v11
+; GFX7-NEXT: v_readfirstlane_b32 s14, v12
+; GFX7-NEXT: v_readfirstlane_b32 s15, v13
+; GFX7-NEXT: v_readfirstlane_b32 s16, v14
+; GFX7-NEXT: s_add_i32 s4, s4, s12
+; GFX7-NEXT: v_readfirstlane_b32 s9, v7
+; GFX7-NEXT: v_readfirstlane_b32 s10, v8
+; GFX7-NEXT: v_readfirstlane_b32 s11, v9
+; GFX7-NEXT: v_readfirstlane_b32 s17, v15
+; GFX7-NEXT: v_readfirstlane_b32 s18, v16
+; GFX7-NEXT: v_readfirstlane_b32 s19, v17
+; GFX7-NEXT: s_add_i32 s5, s5, s13
+; GFX7-NEXT: s_add_i32 s6, s6, s14
+; GFX7-NEXT: s_add_i32 s7, s7, s15
+; GFX7-NEXT: s_add_i32 s8, s8, s16
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_add_i32 s9, s9, s17
+; GFX7-NEXT: s_add_i32 s10, s10, s18
+; GFX7-NEXT: s_add_i32 s11, s11, s19
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-NEXT: v_mov_b32_e32 v6, s8
+; GFX7-NEXT: v_mov_b32_e32 v7, s9
+; GFX7-NEXT: v_mov_b32_e32 v8, s10
+; GFX7-NEXT: v_mov_b32_e32 v9, s11
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P1_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v14, 0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1]
+; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
+; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v5
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s11, v13
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: v_readfirstlane_b32 s2, v4
+; GFX11-NEXT: v_readfirstlane_b32 s7, v9
+; GFX11-NEXT: v_readfirstlane_b32 s8, v10
+; GFX11-NEXT: v_readfirstlane_b32 s9, v11
+; GFX11-NEXT: v_readfirstlane_b32 s10, v12
+; GFX11-NEXT: v_readfirstlane_b32 s15, v17
+; GFX11-NEXT: v_readfirstlane_b32 s4, v6
+; GFX11-NEXT: v_readfirstlane_b32 s5, v7
+; GFX11-NEXT: v_readfirstlane_b32 s6, v8
+; GFX11-NEXT: v_readfirstlane_b32 s12, v14
+; GFX11-NEXT: v_readfirstlane_b32 s13, v15
+; GFX11-NEXT: v_readfirstlane_b32 s14, v16
+; GFX11-NEXT: s_add_i32 s3, s3, s11
+; GFX11-NEXT: s_add_i32 s0, s0, s8
+; GFX11-NEXT: s_add_i32 s1, s1, s9
+; GFX11-NEXT: s_add_i32 s2, s2, s10
+; GFX11-NEXT: s_add_i32 s7, s7, s15
+; GFX11-NEXT: s_add_i32 s4, s4, s12
+; GFX11-NEXT: s_add_i32 s5, s5, s13
+; GFX11-NEXT: s_add_i32 s6, s6, s14
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v14, 0
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1]
+; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
+; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s3, v5
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s11, v13
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s2, v4
+; GFX12-NEXT: v_readfirstlane_b32 s7, v9
+; GFX12-NEXT: v_readfirstlane_b32 s8, v10
+; GFX12-NEXT: v_readfirstlane_b32 s9, v11
+; GFX12-NEXT: v_readfirstlane_b32 s10, v12
+; GFX12-NEXT: v_readfirstlane_b32 s15, v17
+; GFX12-NEXT: v_readfirstlane_b32 s4, v6
+; GFX12-NEXT: v_readfirstlane_b32 s5, v7
+; GFX12-NEXT: v_readfirstlane_b32 s6, v8
+; GFX12-NEXT: v_readfirstlane_b32 s12, v14
+; GFX12-NEXT: v_readfirstlane_b32 s13, v15
+; GFX12-NEXT: v_readfirstlane_b32 s14, v16
+; GFX12-NEXT: s_add_co_i32 s3, s3, s11
+; GFX12-NEXT: s_add_co_i32 s0, s0, s8
+; GFX12-NEXT: s_add_co_i32 s1, s1, s9
+; GFX12-NEXT: s_add_co_i32 s2, s2, s10
+; GFX12-NEXT: s_add_co_i32 s7, s7, s15
+; GFX12-NEXT: s_add_co_i32 s4, s4, s12
+; GFX12-NEXT: s_add_co_i32 s5, s5, s13
+; GFX12-NEXT: s_add_co_i32 s6, s6, s14
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: s_endpgm
+ %a = load <8 x i32>, ptr addrspace(1) %ptra, align 2
+ %b = load volatile <8 x i32>, ptr addrspace(1) %ptra
+ %sum = add <8 x i32> %a, %b
+ store <8 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 512-bit load, not align 4 or not uniform mmo
+define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P1_v16i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
+; GFX7-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7-NEXT: v_readfirstlane_b32 s6, v4
+; GFX7-NEXT: v_readfirstlane_b32 s7, v5
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s8, v6
+; GFX7-NEXT: v_readfirstlane_b32 s9, v7
+; GFX7-NEXT: v_readfirstlane_b32 s10, v8
+; GFX7-NEXT: v_readfirstlane_b32 s11, v9
+; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s12, v10
+; GFX7-NEXT: v_readfirstlane_b32 s13, v11
+; GFX7-NEXT: v_readfirstlane_b32 s14, v12
+; GFX7-NEXT: v_readfirstlane_b32 s15, v13
+; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s16, v14
+; GFX7-NEXT: v_readfirstlane_b32 s17, v15
+; GFX7-NEXT: v_readfirstlane_b32 s18, v16
+; GFX7-NEXT: v_readfirstlane_b32 s19, v17
+; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_readfirstlane_b32 s20, v2
+; GFX7-NEXT: v_readfirstlane_b32 s21, v3
+; GFX7-NEXT: v_readfirstlane_b32 s22, v4
+; GFX7-NEXT: v_readfirstlane_b32 s23, v5
+; GFX7-NEXT: s_add_i32 s4, s4, s20
+; GFX7-NEXT: v_readfirstlane_b32 s24, v6
+; GFX7-NEXT: v_readfirstlane_b32 s25, v7
+; GFX7-NEXT: v_readfirstlane_b32 s26, v8
+; GFX7-NEXT: v_readfirstlane_b32 s27, v9
+; GFX7-NEXT: s_add_i32 s5, s5, s21
+; GFX7-NEXT: v_readfirstlane_b32 s28, v10
+; GFX7-NEXT: v_readfirstlane_b32 s29, v11
+; GFX7-NEXT: v_readfirstlane_b32 s30, v12
+; GFX7-NEXT: v_readfirstlane_b32 s31, v13
+; GFX7-NEXT: s_add_i32 s6, s6, s22
+; GFX7-NEXT: v_readfirstlane_b32 s33, v14
+; GFX7-NEXT: v_readfirstlane_b32 s34, v15
+; GFX7-NEXT: v_readfirstlane_b32 s35, v16
+; GFX7-NEXT: v_readfirstlane_b32 s36, v17
+; GFX7-NEXT: s_add_i32 s7, s7, s23
+; GFX7-NEXT: s_add_i32 s8, s8, s24
+; GFX7-NEXT: s_add_i32 s12, s12, s28
+; GFX7-NEXT: s_add_i32 s16, s16, s33
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_add_i32 s9, s9, s25
+; GFX7-NEXT: s_add_i32 s10, s10, s26
+; GFX7-NEXT: s_add_i32 s11, s11, s27
+; GFX7-NEXT: s_add_i32 s13, s13, s29
+; GFX7-NEXT: s_add_i32 s14, s14, s30
+; GFX7-NEXT: s_add_i32 s15, s15, s31
+; GFX7-NEXT: s_add_i32 s17, s17, s34
+; GFX7-NEXT: s_add_i32 s18, s18, s35
+; GFX7-NEXT: s_add_i32 s19, s19, s36
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-NEXT: v_mov_b32_e32 v6, s8
+; GFX7-NEXT: v_mov_b32_e32 v10, s12
+; GFX7-NEXT: v_mov_b32_e32 v14, s16
+; GFX7-NEXT: v_mov_b32_e32 v7, s9
+; GFX7-NEXT: v_mov_b32_e32 v8, s10
+; GFX7-NEXT: v_mov_b32_e32 v9, s11
+; GFX7-NEXT: v_mov_b32_e32 v11, s13
+; GFX7-NEXT: v_mov_b32_e32 v12, s14
+; GFX7-NEXT: v_mov_b32_e32 v13, s15
+; GFX7-NEXT: v_mov_b32_e32 v15, s17
+; GFX7-NEXT: v_mov_b32_e32 v16, s18
+; GFX7-NEXT: v_mov_b32_e32 v17, s19
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
+; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P1_v16i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v30, 0
+; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: global_load_b128 v[2:5], v30, s[0:1]
+; GFX11-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16
+; GFX11-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32
+; GFX11-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48
+; GFX11-NEXT: global_load_b128 v[18:21], v30, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v5
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: v_readfirstlane_b32 s2, v4
+; GFX11-NEXT: v_readfirstlane_b32 s19, v21
+; GFX11-NEXT: v_readfirstlane_b32 s7, v9
+; GFX11-NEXT: v_readfirstlane_b32 s16, v18
+; GFX11-NEXT: v_readfirstlane_b32 s17, v19
+; GFX11-NEXT: v_readfirstlane_b32 s18, v20
+; GFX11-NEXT: v_readfirstlane_b32 s23, v25
+; GFX11-NEXT: v_readfirstlane_b32 s4, v6
+; GFX11-NEXT: v_readfirstlane_b32 s5, v7
+; GFX11-NEXT: v_readfirstlane_b32 s6, v8
+; GFX11-NEXT: v_readfirstlane_b32 s11, v13
+; GFX11-NEXT: v_readfirstlane_b32 s20, v22
+; GFX11-NEXT: v_readfirstlane_b32 s21, v23
+; GFX11-NEXT: v_readfirstlane_b32 s22, v24
+; GFX11-NEXT: v_readfirstlane_b32 s27, v29
+; GFX11-NEXT: v_readfirstlane_b32 s8, v10
+; GFX11-NEXT: v_readfirstlane_b32 s9, v11
+; GFX11-NEXT: v_readfirstlane_b32 s10, v12
+; GFX11-NEXT: v_readfirstlane_b32 s15, v17
+; GFX11-NEXT: v_readfirstlane_b32 s24, v26
+; GFX11-NEXT: v_readfirstlane_b32 s25, v27
+; GFX11-NEXT: v_readfirstlane_b32 s26, v28
+; GFX11-NEXT: v_readfirstlane_b32 s31, v33
+; GFX11-NEXT: v_readfirstlane_b32 s12, v14
+; GFX11-NEXT: v_readfirstlane_b32 s13, v15
+; GFX11-NEXT: v_readfirstlane_b32 s14, v16
+; GFX11-NEXT: v_readfirstlane_b32 s28, v30
+; GFX11-NEXT: v_readfirstlane_b32 s29, v31
+; GFX11-NEXT: v_readfirstlane_b32 s30, v32
+; GFX11-NEXT: s_add_i32 s3, s3, s19
+; GFX11-NEXT: s_add_i32 s0, s0, s16
+; GFX11-NEXT: s_add_i32 s1, s1, s17
+; GFX11-NEXT: s_add_i32 s2, s2, s18
+; GFX11-NEXT: s_add_i32 s7, s7, s23
+; GFX11-NEXT: s_add_i32 s4, s4, s20
+; GFX11-NEXT: s_add_i32 s5, s5, s21
+; GFX11-NEXT: s_add_i32 s6, s6, s22
+; GFX11-NEXT: s_add_i32 s11, s11, s27
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: s_add_i32 s8, s8, s24
+; GFX11-NEXT: s_add_i32 s9, s9, s25
+; GFX11-NEXT: s_add_i32 s10, s10, s26
+; GFX11-NEXT: s_add_i32 s15, s15, s31
+; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
+; GFX11-NEXT: s_add_i32 s12, s12, s28
+; GFX11-NEXT: s_add_i32 s13, s13, s29
+; GFX11-NEXT: s_add_i32 s14, s14, s30
+; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
+; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
+; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
+; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
+; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
+; GFX11-NEXT: v_mov_b32_e32 v14, s12
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v30, 0
+; GFX12-NEXT: s_clause 0x4
+; GFX12-NEXT: global_load_b128 v[2:5], v30, s[0:1]
+; GFX12-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16
+; GFX12-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32
+; GFX12-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48
+; GFX12-NEXT: global_load_b128 v[18:21], v30, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s3, v5
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s2, v4
+; GFX12-NEXT: v_readfirstlane_b32 s19, v21
+; GFX12-NEXT: v_readfirstlane_b32 s7, v9
+; GFX12-NEXT: v_readfirstlane_b32 s16, v18
+; GFX12-NEXT: v_readfirstlane_b32 s17, v19
+; GFX12-NEXT: v_readfirstlane_b32 s18, v20
+; GFX12-NEXT: v_readfirstlane_b32 s23, v25
+; GFX12-NEXT: v_readfirstlane_b32 s4, v6
+; GFX12-NEXT: v_readfirstlane_b32 s5, v7
+; GFX12-NEXT: v_readfirstlane_b32 s6, v8
+; GFX12-NEXT: v_readfirstlane_b32 s11, v13
+; GFX12-NEXT: v_readfirstlane_b32 s20, v22
+; GFX12-NEXT: v_readfirstlane_b32 s21, v23
+; GFX12-NEXT: v_readfirstlane_b32 s22, v24
+; GFX12-NEXT: v_readfirstlane_b32 s27, v29
+; GFX12-NEXT: v_readfirstlane_b32 s8, v10
+; GFX12-NEXT: v_readfirstlane_b32 s9, v11
+; GFX12-NEXT: v_readfirstlane_b32 s10, v12
+; GFX12-NEXT: v_readfirstlane_b32 s15, v17
+; GFX12-NEXT: v_readfirstlane_b32 s24, v26
+; GFX12-NEXT: v_readfirstlane_b32 s25, v27
+; GFX12-NEXT: v_readfirstlane_b32 s26, v28
+; GFX12-NEXT: v_readfirstlane_b32 s31, v33
+; GFX12-NEXT: v_readfirstlane_b32 s12, v14
+; GFX12-NEXT: v_readfirstlane_b32 s13, v15
+; GFX12-NEXT: v_readfirstlane_b32 s14, v16
+; GFX12-NEXT: v_readfirstlane_b32 s28, v30
+; GFX12-NEXT: v_readfirstlane_b32 s29, v31
+; GFX12-NEXT: v_readfirstlane_b32 s30, v32
+; GFX12-NEXT: s_add_co_i32 s3, s3, s19
+; GFX12-NEXT: s_add_co_i32 s0, s0, s16
+; GFX12-NEXT: s_add_co_i32 s1, s1, s17
+; GFX12-NEXT: s_add_co_i32 s2, s2, s18
+; GFX12-NEXT: s_add_co_i32 s7, s7, s23
+; GFX12-NEXT: s_add_co_i32 s4, s4, s20
+; GFX12-NEXT: s_add_co_i32 s5, s5, s21
+; GFX12-NEXT: s_add_co_i32 s6, s6, s22
+; GFX12-NEXT: s_add_co_i32 s11, s11, s27
+; GFX12-NEXT: v_mov_b32_e32 v5, s3
+; GFX12-NEXT: s_add_co_i32 s8, s8, s24
+; GFX12-NEXT: s_add_co_i32 s9, s9, s25
+; GFX12-NEXT: s_add_co_i32 s10, s10, s26
+; GFX12-NEXT: s_add_co_i32 s15, s15, s31
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
+; GFX12-NEXT: s_add_co_i32 s12, s12, s28
+; GFX12-NEXT: s_add_co_i32 s13, s13, s29
+; GFX12-NEXT: s_add_co_i32 s14, s14, s30
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
+; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
+; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
+; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_mov_b32_e32 v14, s12
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-NEXT: s_endpgm
+ %a = load <16 x i32>, ptr addrspace(1) %ptra, align 2
+ %b = load volatile <16 x i32>, ptr addrspace(1) %ptra
+ %sum = add <16 x i32> %a, %b
+ store <16 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @load_divergent_P3_i8_any_extending(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
+; GFX7-LABEL: load_divergent_P3_i8_any_extending:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_u8 v1, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b8 v0, v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_divergent_P3_i8_any_extending:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: ds_load_u8 v1, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_store_b8 v0, v1
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_divergent_P3_i8_any_extending:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_load_u8 v1, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b8 v0, v1
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ store i8 %a, ptr addrspace(3) %out
+ ret void
+}
+
+; with true16, S16 16-bit load
+; without true16, S32 16-bit any-extending load
+define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
+; GFX7-LABEL: load_divergent_P3_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_u16 v1, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b16 v0, v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_divergent_P3_i16:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-True16-NEXT: ds_load_u16_d16 v1, v1
+; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0
+; GFX11-True16-NEXT: ds_store_b16 v0, v1
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_divergent_P3_i16:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NoTrue16-NEXT: ds_load_u16 v1, v1
+; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NoTrue16-NEXT: ds_store_b16 v0, v1
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_divergent_P3_i16:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1
+; GFX12-True16-NEXT: s_wait_dscnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-True16-NEXT: s_wait_alu 0xf1ff
+; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0
+; GFX12-True16-NEXT: ds_store_b16 v0, v1
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_divergent_P3_i16:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NoTrue16-NEXT: ds_load_u16 v1, v1
+; GFX12-NoTrue16-NEXT: s_wait_dscnt 0x0
+; GFX12-NoTrue16-NEXT: ds_store_b16 v0, v1
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ store i16 %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
+; GFX7-LABEL: load_divergent_P3_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b32 v1, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b32 v0, v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_divergent_P3_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: ds_load_b32 v1, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_store_b32 v0, v1
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_divergent_P3_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_load_b32 v1, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v0, v1
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(3) %ptra
+ store i32 %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_v2i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
+; GFX7-LABEL: load_divergent_P3_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b64 v[1:2], v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b64 v0, v[1:2]
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_divergent_P3_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: ds_load_b64 v[1:2], v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_store_b64 v0, v[1:2]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_divergent_P3_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_load_b64 v[1:2], v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b64 v0, v[1:2]
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(3) %ptra
+ store <2 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_v3i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
+; GFX7-LABEL: load_divergent_P3_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b96 v[1:3], v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b96 v0, v[1:3]
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_divergent_P3_v3i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: ds_load_b96 v[1:3], v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_store_b96 v0, v[1:3]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_divergent_P3_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_load_b96 v[1:3], v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b96 v0, v[1:3]
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(3) %ptra
+ store <3 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_divergent_P3_v4i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
+; GFX7-LABEL: load_divergent_P3_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b128 v[1:4], v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b128 v0, v[1:4]
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_divergent_P3_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: ds_load_b128 v[1:4], v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_store_b128 v0, v[1:4]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_divergent_P3_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_load_b128 v[1:4], v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b128 v0, v[1:4]
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(3) %ptra
+ store <4 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+
+
+; constant address space, addrspace(4)
+; The not-uniform-mmo check for G_LOAD covers the case where the MMO somehow
+; ends up with an address space other than 4; there are no LLVM-IR tests for it.
+; %b in these tests ends up as a uniform load in an sgpr.
+
+; gfx12 true16, not natural alignment
+define amdgpu_ps void @load_uniform_P4_i16_b16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s4, s2
+; GFX7-NEXT: s_mov_b32 s5, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: s_clause 0x1
+; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc
+; GFX11-True16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: s_clause 0x1
+; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
+; GFX12-True16-NEXT: s_load_u16 s0, s[2:3], 0x0
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[2:3], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra, align 1
+ %b = load volatile i16, ptr addrspace(4) %ptrb
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11 true16, 16-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_i16_b16_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_i16_b16_gfx11:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s1, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s0, s1, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx11:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
+; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-True16-NEXT: s_add_i32 s0, s1, s0
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx11:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: s_clause 0x1
+; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: s_clause 0x1
+; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %b = load volatile i16, ptr addrspace(4) %ptra, align 4
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx12 without true16, 16-bit any-extending load, not natural alignment
+define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: s_clause 0x1
+; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
+; GFX11-True16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: s_clause 0x1
+; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
+; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
+; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra, align 1
+ %b = load volatile i16, ptr addrspace(4) %ptra
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11 (or older) without true16, S16 any-extending load, not align 4
+define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx11:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s1, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s0, s1, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
+; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-True16-NEXT: s_add_i32 s0, s1, s0
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: s_clause 0x1
+; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: s_clause 0x1
+; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %b = load volatile i16, ptr addrspace(4) %ptra, align 4
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 32-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s1, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s0, s1, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_b32 v2, v2, s[0:1]
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_b32 v2, v2, s[0:1]
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(4) %ptra, align 2
+ %b = load volatile i32, ptr addrspace(4) %ptra
+ %sum = add i32 %a, %b
+ store i32 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 64-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s1, s5, s1
+; GFX7-NEXT: s_add_i32 s0, s4, s0
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s0, s2, s0
+; GFX11-NEXT: s_add_i32 s1, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s2, v2
+; GFX12-NEXT: v_readfirstlane_b32 s3, v3
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s2, s0
+; GFX12-NEXT: s_add_co_i32 s1, s3, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(4) %ptra, align 2
+ %b = load volatile <2 x i32>, ptr addrspace(4) %ptra
+ %sum = add <2 x i32> %a, %b
+ store <2 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 96-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_v3i32_gfx12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s7, v4
+; GFX7-NEXT: s_add_i32 s4, s0, s4
+; GFX7-NEXT: s_add_i32 s5, s1, s5
+; GFX7-NEXT: s_add_i32 s6, s7, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_v3i32_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_b96 v[2:4], v2, s[0:1]
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s5, v4
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v3
+; GFX11-NEXT: s_add_i32 s2, s5, s2
+; GFX11-NEXT: s_add_i32 s0, s3, s0
+; GFX11-NEXT: s_add_i32 s1, s4, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v3i32_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_b96 v[2:4], v2, s[0:1]
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v4
+; GFX12-NEXT: v_readfirstlane_b32 s3, v2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v3
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s2, s5, s2
+; GFX12-NEXT: s_add_co_i32 s0, s3, s0
+; GFX12-NEXT: s_add_co_i32 s1, s4, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(4) %ptra, align 2
+ %b = load volatile <3 x i32>, ptr addrspace(4) %ptra
+ %sum = add <3 x i32> %a, %b
+ store <3 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 128-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: v_readfirstlane_b32 s8, v4
+; GFX7-NEXT: v_readfirstlane_b32 s9, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s0, s4
+; GFX7-NEXT: s_add_i32 s5, s1, s5
+; GFX7-NEXT: s_add_i32 s6, s8, s6
+; GFX7-NEXT: s_add_i32 s7, s9, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_b128 v[2:5], v2, s[0:1]
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s7, v5
+; GFX11-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-NEXT: v_readfirstlane_b32 s5, v3
+; GFX11-NEXT: v_readfirstlane_b32 s6, v4
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s3, s7, s3
+; GFX11-NEXT: s_add_i32 s0, s4, s0
+; GFX11-NEXT: s_add_i32 s1, s5, s1
+; GFX11-NEXT: s_add_i32 s2, s6, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_b128 v[2:5], v2, s[0:1]
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s7, v5
+; GFX12-NEXT: v_readfirstlane_b32 s4, v2
+; GFX12-NEXT: v_readfirstlane_b32 s5, v3
+; GFX12-NEXT: v_readfirstlane_b32 s6, v4
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s3, s7, s3
+; GFX12-NEXT: s_add_co_i32 s0, s4, s0
+; GFX12-NEXT: s_add_co_i32 s1, s5, s1
+; GFX12-NEXT: s_add_co_i32 s2, s6, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(4) %ptra, align 2
+ %b = load volatile <4 x i32>, ptr addrspace(4) %ptra
+ %sum = add <4 x i32> %a, %b
+ store <4 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 256-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_v8i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_readfirstlane_b32 s12, v2
+; GFX7-NEXT: v_readfirstlane_b32 s13, v3
+; GFX7-NEXT: v_readfirstlane_b32 s14, v4
+; GFX7-NEXT: v_readfirstlane_b32 s15, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s16, v6
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s12, s4
+; GFX7-NEXT: v_readfirstlane_b32 s17, v7
+; GFX7-NEXT: v_readfirstlane_b32 s18, v8
+; GFX7-NEXT: v_readfirstlane_b32 s19, v9
+; GFX7-NEXT: s_add_i32 s5, s13, s5
+; GFX7-NEXT: s_add_i32 s6, s14, s6
+; GFX7-NEXT: s_add_i32 s7, s15, s7
+; GFX7-NEXT: s_add_i32 s8, s16, s8
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_add_i32 s9, s17, s9
+; GFX7-NEXT: s_add_i32 s10, s18, s10
+; GFX7-NEXT: s_add_i32 s11, s19, s11
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-NEXT: v_mov_b32_e32 v6, s8
+; GFX7-NEXT: v_mov_b32_e32 v7, s9
+; GFX7-NEXT: v_mov_b32_e32 v8, s10
+; GFX7-NEXT: v_mov_b32_e32 v9, s11
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1]
+; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_readfirstlane_b32 s11, v5
+; GFX11-NEXT: v_readfirstlane_b32 s8, v2
+; GFX11-NEXT: v_readfirstlane_b32 s9, v3
+; GFX11-NEXT: v_readfirstlane_b32 s10, v4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s15, v9
+; GFX11-NEXT: v_readfirstlane_b32 s12, v6
+; GFX11-NEXT: v_readfirstlane_b32 s13, v7
+; GFX11-NEXT: v_readfirstlane_b32 s14, v8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s3, s11, s3
+; GFX11-NEXT: s_add_i32 s0, s8, s0
+; GFX11-NEXT: s_add_i32 s1, s9, s1
+; GFX11-NEXT: s_add_i32 s2, s10, s2
+; GFX11-NEXT: s_add_i32 s7, s15, s7
+; GFX11-NEXT: s_add_i32 s4, s12, s4
+; GFX11-NEXT: s_add_i32 s5, s13, s5
+; GFX11-NEXT: s_add_i32 s6, s14, s6
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v6, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1]
+; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16
+; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_readfirstlane_b32 s11, v5
+; GFX12-NEXT: v_readfirstlane_b32 s8, v2
+; GFX12-NEXT: v_readfirstlane_b32 s9, v3
+; GFX12-NEXT: v_readfirstlane_b32 s10, v4
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s15, v9
+; GFX12-NEXT: v_readfirstlane_b32 s12, v6
+; GFX12-NEXT: v_readfirstlane_b32 s13, v7
+; GFX12-NEXT: v_readfirstlane_b32 s14, v8
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s3, s11, s3
+; GFX12-NEXT: s_add_co_i32 s0, s8, s0
+; GFX12-NEXT: s_add_co_i32 s1, s9, s1
+; GFX12-NEXT: s_add_co_i32 s2, s10, s2
+; GFX12-NEXT: s_add_co_i32 s7, s15, s7
+; GFX12-NEXT: s_add_co_i32 s4, s12, s4
+; GFX12-NEXT: s_add_co_i32 s5, s13, s5
+; GFX12-NEXT: s_add_co_i32 s6, s14, s6
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: s_endpgm
+ %a = load <8 x i32>, ptr addrspace(4) %ptra, align 2
+ %b = load volatile <8 x i32>, ptr addrspace(4) %ptra
+ %sum = add <8 x i32> %a, %b
+ store <8 x i32> %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 512-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_v16i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48
+; GFX7-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_readfirstlane_b32 s20, v2
+; GFX7-NEXT: v_readfirstlane_b32 s21, v3
+; GFX7-NEXT: v_readfirstlane_b32 s22, v4
+; GFX7-NEXT: v_readfirstlane_b32 s23, v5
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_readfirstlane_b32 s24, v6
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_readfirstlane_b32 s28, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s33, v14
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s20, s4
+; GFX7-NEXT: v_readfirstlane_b32 s25, v7
+; GFX7-NEXT: v_readfirstlane_b32 s26, v8
+; GFX7-NEXT: v_readfirstlane_b32 s27, v9
+; GFX7-NEXT: v_readfirstlane_b32 s29, v11
+; GFX7-NEXT: v_readfirstlane_b32 s30, v12
+; GFX7-NEXT: v_readfirstlane_b32 s31, v13
+; GFX7-NEXT: v_readfirstlane_b32 s34, v15
+; GFX7-NEXT: v_readfirstlane_b32 s35, v16
+; GFX7-NEXT: v_readfirstlane_b32 s36, v17
+; GFX7-NEXT: s_add_i32 s5, s21, s5
+; GFX7-NEXT: s_add_i32 s6, s22, s6
+; GFX7-NEXT: s_add_i32 s7, s23, s7
+; GFX7-NEXT: s_add_i32 s8, s24, s8
+; GFX7-NEXT: s_add_i32 s12, s28, s12
+; GFX7-NEXT: s_add_i32 s16, s33, s16
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_add_i32 s9, s25, s9
+; GFX7-NEXT: s_add_i32 s10, s26, s10
+; GFX7-NEXT: s_add_i32 s11, s27, s11
+; GFX7-NEXT: s_add_i32 s13, s29, s13
+; GFX7-NEXT: s_add_i32 s14, s30, s14
+; GFX7-NEXT: s_add_i32 s15, s31, s15
+; GFX7-NEXT: s_add_i32 s17, s34, s17
+; GFX7-NEXT: s_add_i32 s18, s35, s18
+; GFX7-NEXT: s_add_i32 s19, s36, s19
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-NEXT: v_mov_b32_e32 v6, s8
+; GFX7-NEXT: v_mov_b32_e32 v10, s12
+; GFX7-NEXT: v_mov_b32_e32 v14, s16
+; GFX7-NEXT: v_mov_b32_e32 v7, s9
+; GFX7-NEXT: v_mov_b32_e32 v8, s10
+; GFX7-NEXT: v_mov_b32_e32 v9, s11
+; GFX7-NEXT: v_mov_b32_e32 v11, s13
+; GFX7-NEXT: v_mov_b32_e32 v12, s14
+; GFX7-NEXT: v_mov_b32_e32 v13, s15
+; GFX7-NEXT: v_mov_b32_e32 v15, s17
+; GFX7-NEXT: v_mov_b32_e32 v16, s18
+; GFX7-NEXT: v_mov_b32_e32 v17, s19
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
+; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_v16i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v14, 0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1]
+; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
+; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32
+; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48
+; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_readfirstlane_b32 s19, v5
+; GFX11-NEXT: v_readfirstlane_b32 s16, v2
+; GFX11-NEXT: v_readfirstlane_b32 s17, v3
+; GFX11-NEXT: v_readfirstlane_b32 s18, v4
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_readfirstlane_b32 s23, v9
+; GFX11-NEXT: v_readfirstlane_b32 s20, v6
+; GFX11-NEXT: v_readfirstlane_b32 s21, v7
+; GFX11-NEXT: v_readfirstlane_b32 s22, v8
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_readfirstlane_b32 s27, v13
+; GFX11-NEXT: v_readfirstlane_b32 s24, v10
+; GFX11-NEXT: v_readfirstlane_b32 s25, v11
+; GFX11-NEXT: v_readfirstlane_b32 s26, v12
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s31, v17
+; GFX11-NEXT: v_readfirstlane_b32 s28, v14
+; GFX11-NEXT: v_readfirstlane_b32 s29, v15
+; GFX11-NEXT: v_readfirstlane_b32 s30, v16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s3, s19, s3
+; GFX11-NEXT: s_add_i32 s0, s16, s0
+; GFX11-NEXT: s_add_i32 s1, s17, s1
+; GFX11-NEXT: s_add_i32 s2, s18, s2
+; GFX11-NEXT: s_add_i32 s7, s23, s7
+; GFX11-NEXT: s_add_i32 s4, s20, s4
+; GFX11-NEXT: s_add_i32 s5, s21, s5
+; GFX11-NEXT: s_add_i32 s6, s22, s6
+; GFX11-NEXT: s_add_i32 s11, s27, s11
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: s_add_i32 s8, s24, s8
+; GFX11-NEXT: s_add_i32 s9, s25, s9
+; GFX11-NEXT: s_add_i32 s10, s26, s10
+; GFX11-NEXT: s_add_i32 s15, s31, s15
+; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
+; GFX11-NEXT: s_add_i32 s12, s28, s12
+; GFX11-NEXT: s_add_i32 s13, s29, s13
+; GFX11-NEXT: s_add_i32 s14, s30, s14
+; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
+; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
+; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
+; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
+; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
+; GFX11-NEXT: v_mov_b32_e32 v14, s12
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v14, 0
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1]
+; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
+; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32
+; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48
+; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x3
+; GFX12-NEXT: v_readfirstlane_b32 s19, v5
+; GFX12-NEXT: v_readfirstlane_b32 s16, v2
+; GFX12-NEXT: v_readfirstlane_b32 s17, v3
+; GFX12-NEXT: v_readfirstlane_b32 s18, v4
+; GFX12-NEXT: s_wait_loadcnt 0x2
+; GFX12-NEXT: v_readfirstlane_b32 s23, v9
+; GFX12-NEXT: v_readfirstlane_b32 s20, v6
+; GFX12-NEXT: v_readfirstlane_b32 s21, v7
+; GFX12-NEXT: v_readfirstlane_b32 s22, v8
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_readfirstlane_b32 s27, v13
+; GFX12-NEXT: v_readfirstlane_b32 s24, v10
+; GFX12-NEXT: v_readfirstlane_b32 s25, v11
+; GFX12-NEXT: v_readfirstlane_b32 s26, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s31, v17
+; GFX12-NEXT: v_readfirstlane_b32 s28, v14
+; GFX12-NEXT: v_readfirstlane_b32 s29, v15
+; GFX12-NEXT: v_readfirstlane_b32 s30, v16
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s3, s19, s3
+; GFX12-NEXT: s_add_co_i32 s0, s16, s0
+; GFX12-NEXT: s_add_co_i32 s1, s17, s1
+; GFX12-NEXT: s_add_co_i32 s2, s18, s2
+; GFX12-NEXT: s_add_co_i32 s7, s23, s7
+; GFX12-NEXT: s_add_co_i32 s4, s20, s4
+; GFX12-NEXT: s_add_co_i32 s5, s21, s5
+; GFX12-NEXT: s_add_co_i32 s6, s22, s6
+; GFX12-NEXT: s_add_co_i32 s11, s27, s11
+; GFX12-NEXT: v_mov_b32_e32 v5, s3
+; GFX12-NEXT: s_add_co_i32 s8, s24, s8
+; GFX12-NEXT: s_add_co_i32 s9, s25, s9
+; GFX12-NEXT: s_add_co_i32 s10, s26, s10
+; GFX12-NEXT: s_add_co_i32 s15, s31, s15
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
+; GFX12-NEXT: s_add_co_i32 s12, s28, s12
+; GFX12-NEXT: s_add_co_i32 s13, s29, s13
+; GFX12-NEXT: s_add_co_i32 s14, s30, s14
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
+; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
+; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
+; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_mov_b32_e32 v14, s12
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-NEXT: s_endpgm
+ %a = load <16 x i32>, ptr addrspace(4) %ptra, align 2
+ %b = load volatile <16 x i32>, ptr addrspace(4) %ptra
+ %sum = add <16 x i32> %a, %b
+ store <16 x i32> %sum, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll
new file mode 100644
index 0000000000000..bf36deac33380
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll
@@ -0,0 +1,602 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s
+
+; global address space, addrspace(1)
+
+; gfx12, with true16 is S16 16-bit load
+; gfx12, without true16 is S32 16-bit any-extending load
+define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_i16_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P1_i16_gfx12:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_gfx12:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11 and older, with true16 is S16 16-bit load
+; gfx11 and older, without true16 is S32 16-bit any-extending load
+; both cases require align 4 and a uniform mmo to widen the mmo to a 32-bit load
+define amdgpu_ps void @load_uniform_P1_i16_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra, align 4
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx12, S32 8-bit any-extending load, no difference regarding true16
+define amdgpu_ps void @load_uniform_P1_i8_any_extending_load(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_i8_any_extending_load:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_u8 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_i8_any_extending_load:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16
+define amdgpu_ps void @load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra, align 4
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(1) %ptra
+ store i32 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(1) %ptra
+ store <2 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11, S96 load align 16 (default) is widened to load S128
+define amdgpu_ps void @load_uniform_P1_v3i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_v3i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(1) %ptra
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11, S96 load align 4 is split into load S64 + load S32
+define amdgpu_ps void @load_uniform_P1_v3i32_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_v3i32_align4_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: v_mov_b32_e32 v4, s6
+; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v3i32_align4_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(1) %ptra, align 4
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(1) %ptra
+ store <4 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: s_endpgm
+ %a = load <8 x i32>, ptr addrspace(1) %ptra
+ store <8 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P1_v16i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
+; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
+; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
+; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P1_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
+; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
+; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
+; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-NEXT: s_endpgm
+ %a = load <16 x i32>, ptr addrspace(1) %ptra
+ store <16 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+; constant address space, addrspace(4)
+
+define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_i16_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P4_i16_gfx12:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_gfx12:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P4_i16_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-True16-NEXT: s_endpgm
+;
+; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
+; GFX12-NoTrue16: ; %bb.0:
+; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
+; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NoTrue16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra, align 4
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx12, S32 8-bit any-extending load, no difference regarding true16
+define amdgpu_ps void @load_uniform_P4_i8_any_extending_load(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_i8_any_extending_load:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_u8 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_i8_any_extending_load:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16
+define amdgpu_ps void @load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra, align 4
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(4) %ptra
+ store i32 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %a = load <2 x i32>, ptr addrspace(4) %ptra
+ store <2 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11, S96 load align 16 (default) is widened to load S128
+define amdgpu_ps void @load_uniform_P4_v3i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_v3i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(4) %ptra
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+; gfx11, S96 load align 4 is split into load S64 + load S32
+define amdgpu_ps void @load_uniform_P4_v3i32_align4_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_v3i32_align4_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: v_mov_b32_e32 v4, s6
+; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v3i32_align4_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-NEXT: s_endpgm
+ %a = load <3 x i32>, ptr addrspace(4) %ptra, align 4
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(4) %ptra
+ store <4 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: s_endpgm
+ %a = load <8 x i32>, ptr addrspace(4) %ptra
+ store <8 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: load_uniform_P4_v16i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
+; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
+; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
+; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
+; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
+; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
+; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
+; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
+; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-NEXT: s_endpgm
+ %a = load <16 x i32>, ptr addrspace(4) %ptra
+ store <16 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-divergent.ll
new file mode 100644
index 0000000000000..312d5b4e4c3bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-divergent.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @sextload_P0_i8(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_i8 v0, v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: flat_store_b32 v[2:3], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a32 = sext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P0_i16(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_i16 v0, v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: flat_store_b32 v[2:3], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(0) %ptra
+ %a32 = sext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_u8 v0, v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: flat_store_b32 v[2:3], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a32 = zext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P0_i16(ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_u16 v0, v[0:1]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: flat_store_b32 v[2:3], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(0) %ptra
+ %a32 = zext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a32 = sext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i16 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %a32 = sext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a32 = zext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %a32 = zext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_i8 v0, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: ds_store_b32 v1, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a32 = sext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P3_i16(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_i16 v0, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: ds_store_b32 v1, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ %a32 = sext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u8 v0, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: ds_store_b32 v1, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a32 = zext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P3_i16(ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u16 v0, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: ds_store_b32 v1, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ %a32 = zext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a32 = sext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i16(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_i16 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %a32 = sext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a32 = zext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i16(ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: global_store_b32 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %a32 = zext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P5_i8(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: sextload_P5_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_i8 v0, v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(5) %ptra
+ %a32 = sext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P5_i16(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: sextload_P5_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_i16 v0, v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(5) %ptra
+ %a32 = sext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P5_i8(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: zextload_P5_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_u8 v0, v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(5) %ptra
+ %a32 = zext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P5_i16(ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: zextload_P5_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_u16 v0, v0, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0
+; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(5) %ptra
+ %a32 = zext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(5) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll
new file mode 100644
index 0000000000000..f12ec4dff8549
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll
@@ -0,0 +1,608 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX12 %s
+
+define amdgpu_ps void @sextload_and_zextload_P1_i8_not_uniform_mmo_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_and_zextload_P1_i8_not_uniform_mmo_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_i8 v3, v2, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_u8 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_and_zextload_P1_i8_not_uniform_mmo_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_i8 v3, v2, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_u8 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load volatile i8, ptr addrspace(1) %ptra
+ %a32 = sext i8 %a to i32
+ %b = load volatile i8, ptr addrspace(1) %ptrb
+ %b32 = zext i8 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_i8 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_i8 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0
+; GFX12-NEXT: global_load_i8 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a32 = sext i8 %a to i32
+ %b = load volatile i8, ptr addrspace(1) %ptrb, align 4
+ %b32 = sext i8 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_u8 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_u8 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: global_load_u8 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a32 = zext i8 %a to i32
+ %b = load volatile i8, ptr addrspace(1) %ptrb, align 4
+ %b32 = zext i8 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_i16 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_i16 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_i16 v3, v2, s[0:1]
+; GFX12-NEXT: global_load_i16 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra, align 1
+ %a32 = sext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(1) %ptrb
+ %b32 = sext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_i16 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_i16 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0
+; GFX12-NEXT: global_load_i16 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %a32 = sext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(1) %ptrb, align 4
+ %b32 = sext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX12-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra, align 1
+ %a32 = zext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(1) %ptrb
+ %b32 = zext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %a32 = zext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(1) %ptrb, align 4
+ %b32 = zext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @sextload_and_zextload_P3_i8(ptr addrspace(3) inreg %ptra, ptr addrspace(3) inreg %ptrb, ptr addrspace(3) %out) {
+; GFX11-LABEL: sextload_and_zextload_P3_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX11-NEXT: ds_load_i8 v1, v1
+; GFX11-NEXT: ds_load_u8 v2, v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: ds_store_b32 v0, v1
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_and_zextload_P3_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: ds_load_i8 v1, v1
+; GFX12-NEXT: ds_load_u8 v2, v2
+; GFX12-NEXT: s_wait_dscnt 0x1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_store_b32 v0, v1
+; GFX12-NEXT: s_endpgm
+ %a = load volatile i8, ptr addrspace(3) %ptra
+ %a32 = sext i8 %a to i32
+ %b = load volatile i8, ptr addrspace(3) %ptrb
+ %b32 = zext i8 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_and_zextload_P3_i16(ptr addrspace(3) inreg %ptra, ptr addrspace(3) inreg %ptrb, ptr addrspace(3) %out) {
+; GFX11-LABEL: sextload_and_zextload_P3_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX11-NEXT: ds_load_i16 v1, v1
+; GFX11-NEXT: ds_load_u16 v2, v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: ds_store_b32 v0, v1
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_and_zextload_P3_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: ds_load_i16 v1, v1
+; GFX12-NEXT: ds_load_u16 v2, v2
+; GFX12-NEXT: s_wait_dscnt 0x1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_store_b32 v0, v1
+; GFX12-NEXT: s_endpgm
+ %a = load volatile i16, ptr addrspace(3) %ptra
+ %a32 = sext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(3) %ptrb
+ %b32 = zext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(3) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @sextload_and_zextload_P4_i8_not_uniform_mmo_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_and_zextload_P4_i8_not_uniform_mmo_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_i8 v3, v2, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u8 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_and_zextload_P4_i8_not_uniform_mmo_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load volatile i8, ptr addrspace(4) %ptra
+ %a32 = sext i8 %a to i32
+ %b = load volatile i8, ptr addrspace(4) %ptrb
+ %b32 = zext i8 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_i8 v2, v2, s[0:1]
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sext_i32_i8 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_load_i8 s1, s[2:3], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a32 = sext i8 %a to i32
+ %b = load volatile i8, ptr addrspace(4) %ptrb, align 4
+ %b32 = sext i8 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_u8 v2, v2, s[0:1]
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a32 = zext i8 %a to i32
+ %b = load volatile i8, ptr addrspace(4) %ptrb, align 4
+ %b32 = zext i8 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_i16 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_i16 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_i16 v2, v2, s[0:1]
+; GFX12-NEXT: s_load_i16 s0, s[2:3], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra, align 1
+ %a32 = sext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(4) %ptrb
+ %b32 = sext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_i16 v2, v2, s[0:1]
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sext_i32_i16 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0
+; GFX12-NEXT: s_load_i16 s1, s[2:3], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %a32 = sext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(4) %ptrb, align 4
+ %b32 = sext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_u16 v2, v2, s[0:1]
+; GFX12-NEXT: s_load_u16 s0, s[2:3], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra, align 1
+ %a32 = zext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(4) %ptrb
+ %b32 = zext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_u16 v2, v2, s[0:1]
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NEXT: s_load_u16 s1, s[2:3], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %a32 = zext i16 %a to i32
+ %b = load volatile i16, ptr addrspace(4) %ptrb, align 4
+ %b32 = zext i16 %b to i32
+ %res = add i32 %a32, %b32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform.ll
new file mode 100644
index 0000000000000..e094a1451f42c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @sextload_P1_i8_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P1_i8_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_i8 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P1_i8_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a32 = sext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P1_i8_align4_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sext_i32_i8 s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P1_i8_align4_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra, align 4
+ %a32 = sext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P1_i16_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_i16 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P1_i16_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %a32 = sext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i16_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: sextload_P1_i16_align4_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sext_i32_i16 s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sextload_P1_i16_align4_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra, align 4
+ %a32 = sext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P1_i8_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_u8 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P1_i8_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a32 = zext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P1_i8_align4_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P1_i8_align4_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra, align 4
+ %a32 = zext i8 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P1_i16_gfx12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_u16 v2, v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P1_i16_gfx12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %a32 = zext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i16_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
+; GFX11-LABEL: zextload_P1_i16_align4_gfx11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: zextload_P1_i16_align4_gfx11:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra, align 4
+ %a32 = zext i16 %a to i32
+ %res = add i32 %a32, %a32
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
index dc782aa08ae99..39eb41f387cf8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -o - %s | FileCheck %s
define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspace(6) inreg %arg3) {
; CHECK-LABEL: test1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
index dd7a3ebeab471..1c1cda2157c9f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
@@ -18,8 +18,7 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -74,8 +73,7 @@ body: |
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -132,8 +130,7 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s64) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[LOAD]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: G_STORE [[LOAD]](s64), [[MV1]](p1) :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -188,8 +185,7 @@ body: |
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[BITCAST]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s64), [[MV1]](p1) :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -247,8 +243,7 @@ body: |
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[UV1]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: G_STORE [[UV1]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -307,8 +302,7 @@ body: |
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<4 x s16>), addrspace 1)
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index e448c4cba0941..d52b5fe9df247 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -119,10 +119,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
@@ -152,10 +151,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
@@ -192,16 +190,15 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, basealign 64, addrspace 1)
- ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1)
- ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
@@ -238,16 +235,15 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1)
- ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1)
- ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
@@ -368,10 +364,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
@@ -400,10 +395,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY1]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s256)
@@ -433,10 +427,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY1]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
@@ -465,10 +458,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
@@ -505,16 +497,15 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4)
- ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4)
- ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
@@ -551,16 +542,15 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
- ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4)
- ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4)
- ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
@@ -736,8 +726,7 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 4)
; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
;
; GFX12-LABEL: name: extload_constant_i8_to_i32_uniform
@@ -762,8 +751,7 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 1)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 1, align 1)
@@ -782,8 +770,7 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4)
; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
;
; GFX12-LABEL: name: extload_constant_i16_to_i32_uniform
@@ -808,8 +795,7 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 1)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 1, align 2)
@@ -845,8 +831,7 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), align 2, addrspace 4)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 2)
@@ -865,8 +850,7 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), align 1, addrspace 4)
; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir
index b257db4f1e665..032357f611dcc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s
---
name: sextload_constant_i8_to_i32_uniform
@@ -13,8 +12,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 4)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_SEXTLOAD %0 :: (load (s8), addrspace 4, align 1)
...
@@ -31,8 +30,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
+ ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_SEXTLOAD %0 :: (load (s8), addrspace 1, align 1)
...
@@ -49,8 +48,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 4)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_SEXTLOAD %0 :: (load (s16), addrspace 4, align 2)
...
@@ -67,8 +66,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
+ ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_SEXTLOAD %0 :: (load (s16), addrspace 1, align 2)
...
@@ -86,6 +85,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p3) :: (load (s8), addrspace 3)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
%0:_(p3) = COPY $sgpr0
%1:_(s32) = G_SEXTLOAD %0 :: (load (s8), addrspace 3, align 1)
...
@@ -104,6 +104,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p3) :: (load (s16), addrspace 3)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
%0:_(p3) = COPY $sgpr0
%1:_(s32) = G_SEXTLOAD %0 :: (load (s16), addrspace 3, align 2)
...
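
A note for readers skimming the autogenerated checks above: extending loads have no scalar (s_load) form, so a uniform G_SEXTLOAD/G_ZEXTLOAD keeps its pointer on the sgpr bank, issues the load on the vgpr bank, and reads the uniform result back with G_AMDGPU_READANYLANE. A hand-written sketch of the lowered form (illustrative only; the register names are not from any checked-in test):

  %ptr:sgpr(p4) = COPY $sgpr0_sgpr1
  ; extending load has no scalar form, so it is issued on the vgpr bank
  %val:vgpr(s32) = G_SEXTLOAD %ptr(p4) :: (load (s8), addrspace 4)
  ; uniform result is moved back to the sgpr bank
  %uniform:sgpr(s32) = G_AMDGPU_READANYLANE %val

For the addrspace(3) cases directly above, the pointer is still copied to vgpr first (the load needs a vgpr address), but the uniform result gets the same READANYLANE copy back to sgpr.
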
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir
index efdf4b7f25fd7..3fa90e315fc27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri -run-pass=regbankselect -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX7 %s
-# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -run-pass=regbankselect -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX1010 %s
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX7 %s
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX1010 %s
+
+# FixMe: need merge/unmerge artifact combine
---
name: test_uniform_load_without_noclobber
@@ -16,27 +18,46 @@ body: |
; GFX7-NEXT: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX7-NEXT: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load (<4 x s32>), align 4, addrspace 1)
- ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C]](s64)
+ ; GFX7-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst16(s64)
; GFX7-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from unknown-address + 16, align 4, addrspace 1)
- ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C1]](s64)
+ ; GFX7-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst32(s64)
; GFX7-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from unknown-address + 32, align 4, addrspace 1)
- ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C2]](s64)
+ ; GFX7-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst48(s64)
; GFX7-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from unknown-address + 48, align 4, addrspace 1)
- ; GFX7-NEXT: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
- ; GFX7-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>), %load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
- ; GFX7-NEXT: G_STORE %load0_3(<4 x s32>), %out_addr(p1) :: (store (<4 x s32>), align 4, addrspace 1)
- ; GFX7-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]]
+ ; GFX7-NEXT: %load:sgpr(<16 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32), [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32), [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32), [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32), [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32)
+ ; GFX7-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>), %load8_11:sgpr(<4 x s32>), %load12_15:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>)
+ ; GFX7-NEXT: G_STORE [[COPY]](<4 x s32>), %out_addr(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX7-NEXT: %out_addr_plus_16:sgpr(p1) = G_PTR_ADD %out_addr, %cst16(s64)
- ; GFX7-NEXT: G_STORE %load4_7(<4 x s32>), %out_addr_plus_16(p1) :: (store (<4 x s32>), align 4, addrspace 1)
- ; GFX7-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>)
+ ; GFX7-NEXT: G_STORE [[COPY1]](<4 x s32>), %out_addr_plus_16(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX7-NEXT: %out_addr_plus_32:sgpr(p1) = G_PTR_ADD %out_addr, %cst32(s64)
- ; GFX7-NEXT: G_STORE %load8_11(<4 x s32>), %out_addr_plus_32(p1) :: (store (<4 x s32>), align 4, addrspace 1)
- ; GFX7-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load8_11(<4 x s32>)
+ ; GFX7-NEXT: G_STORE [[COPY2]](<4 x s32>), %out_addr_plus_32(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX7-NEXT: %out_addr_plus_48:sgpr(p1) = G_PTR_ADD %out_addr, %cst48(s64)
- ; GFX7-NEXT: G_STORE %load12_15(<4 x s32>), %out_addr_plus_48(p1) :: (store (<4 x s32>), align 4, addrspace 1)
+ ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load12_15(<4 x s32>)
+ ; GFX7-NEXT: G_STORE [[COPY3]](<4 x s32>), %out_addr_plus_48(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
;
; GFX1010-LABEL: name: test_uniform_load_without_noclobber
@@ -44,33 +65,47 @@ body: |
; GFX1010-NEXT: {{ $}}
; GFX1010-NEXT: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX1010-NEXT: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3
- ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY %in_addr(p1)
; GFX1010-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load (<4 x s32>), align 4, addrspace 1)
- ; GFX1010-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C]](s64)
+ ; GFX1010-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst16(s64)
; GFX1010-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from unknown-address + 16, align 4, addrspace 1)
- ; GFX1010-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GFX1010-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C1]](s64)
+ ; GFX1010-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX1010-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst32(s64)
; GFX1010-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from unknown-address + 32, align 4, addrspace 1)
- ; GFX1010-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GFX1010-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C2]](s64)
+ ; GFX1010-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX1010-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst48(s64)
; GFX1010-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from unknown-address + 48, align 4, addrspace 1)
- ; GFX1010-NEXT: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
- ; GFX1010-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>), %load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
- ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY %out_addr(p1)
- ; GFX1010-NEXT: G_STORE %load0_3(<4 x s32>), [[COPY1]](p1) :: (store (<4 x s32>), align 4, addrspace 1)
- ; GFX1010-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX1010-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; GFX1010-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]]
+ ; GFX1010-NEXT: %load:sgpr(<16 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32), [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32), [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32), [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32), [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32)
+ ; GFX1010-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>), %load8_11:sgpr(<4 x s32>), %load12_15:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
+ ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>)
+ ; GFX1010-NEXT: G_STORE [[COPY]](<4 x s32>), %out_addr(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX1010-NEXT: %out_addr_plus_16:sgpr(p1) = G_PTR_ADD %out_addr, %cst16(s64)
- ; GFX1010-NEXT: [[COPY2:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_16(p1)
- ; GFX1010-NEXT: G_STORE %load4_7(<4 x s32>), [[COPY2]](p1) :: (store (<4 x s32>), align 4, addrspace 1)
- ; GFX1010-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>)
+ ; GFX1010-NEXT: G_STORE [[COPY1]](<4 x s32>), %out_addr_plus_16(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX1010-NEXT: %out_addr_plus_32:sgpr(p1) = G_PTR_ADD %out_addr, %cst32(s64)
- ; GFX1010-NEXT: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_32(p1)
- ; GFX1010-NEXT: G_STORE %load8_11(<4 x s32>), [[COPY3]](p1) :: (store (<4 x s32>), align 4, addrspace 1)
- ; GFX1010-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX1010-NEXT: [[COPY2:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load8_11(<4 x s32>)
+ ; GFX1010-NEXT: G_STORE [[COPY2]](<4 x s32>), %out_addr_plus_32(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX1010-NEXT: %out_addr_plus_48:sgpr(p1) = G_PTR_ADD %out_addr, %cst48(s64)
- ; GFX1010-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_48(p1)
- ; GFX1010-NEXT: G_STORE %load12_15(<4 x s32>), [[COPY4]](p1) :: (store (<4 x s32>), align 4, addrspace 1)
+ ; GFX1010-NEXT: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load12_15(<4 x s32>)
+ ; GFX1010-NEXT: G_STORE [[COPY3]](<4 x s32>), %out_addr_plus_48(p1) :: (store (<4 x s32>), align 4, addrspace 1)
; GFX1010-NEXT: S_ENDPGM 0
%in_addr:_(p1) = COPY $sgpr0_sgpr1
%out_addr:_(p1) = COPY $sgpr2_sgpr3
@@ -103,15 +138,26 @@ body: |
; GFX7-NEXT: %ptr:sgpr(p4) = COPY $sgpr0_sgpr1
; GFX7-NEXT: %out:sgpr(p1) = COPY $sgpr2_sgpr3
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %ptr(p4) :: (load (<4 x s32>), align 1, addrspace 4)
- ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD %ptr, [[C]](s64)
- ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4)
- ; GFX7-NEXT: %load:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
- ; GFX7-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>)
- ; GFX7-NEXT: G_STORE %load0_3(<4 x s32>), %out(p1) :: (store (<4 x s32>), align 32, addrspace 1)
; GFX7-NEXT: %cst_16:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD %ptr, %cst_16(s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4)
+ ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GFX7-NEXT: %load:sgpr(<8 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
+ ; GFX7-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>)
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>)
+ ; GFX7-NEXT: G_STORE [[COPY]](<4 x s32>), %out(p1) :: (store (<4 x s32>), align 32, addrspace 1)
; GFX7-NEXT: %out_plus_16:sgpr(p1) = G_PTR_ADD %out, %cst_16(s64)
- ; GFX7-NEXT: G_STORE %load4_7(<4 x s32>), %out_plus_16(p1) :: (store (<4 x s32>), align 32, addrspace 1)
+ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>)
+ ; GFX7-NEXT: G_STORE [[COPY1]](<4 x s32>), %out_plus_16(p1) :: (store (<4 x s32>), align 32, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
;
; GFX1010-LABEL: name: test_s_load_constant_v8i32_align1
@@ -119,19 +165,27 @@ body: |
; GFX1010-NEXT: {{ $}}
; GFX1010-NEXT: %ptr:sgpr(p4) = COPY $sgpr0_sgpr1
; GFX1010-NEXT: %out:sgpr(p1) = COPY $sgpr2_sgpr3
- ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %ptr(p4)
; GFX1010-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %ptr(p4) :: (load (<4 x s32>), align 1, addrspace 4)
- ; GFX1010-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD %ptr, [[C]](s64)
- ; GFX1010-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4)
- ; GFX1010-NEXT: %load:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
- ; GFX1010-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>)
- ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY %out(p1)
- ; GFX1010-NEXT: G_STORE %load0_3(<4 x s32>), [[COPY1]](p1) :: (store (<4 x s32>), align 32, addrspace 1)
; GFX1010-NEXT: %cst_16:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD %ptr, %cst_16(s64)
+ ; GFX1010-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4)
+ ; GFX1010-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GFX1010-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GFX1010-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GFX1010-NEXT: %load:sgpr(<8 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
+ ; GFX1010-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>)
+ ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>)
+ ; GFX1010-NEXT: G_STORE [[COPY]](<4 x s32>), %out(p1) :: (store (<4 x s32>), align 32, addrspace 1)
; GFX1010-NEXT: %out_plus_16:sgpr(p1) = G_PTR_ADD %out, %cst_16(s64)
- ; GFX1010-NEXT: [[COPY2:%[0-9]+]]:vgpr(p1) = COPY %out_plus_16(p1)
- ; GFX1010-NEXT: G_STORE %load4_7(<4 x s32>), [[COPY2]](p1) :: (store (<4 x s32>), align 32, addrspace 1)
+ ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>)
+ ; GFX1010-NEXT: G_STORE [[COPY1]](<4 x s32>), %out_plus_16(p1) :: (store (<4 x s32>), align 32, addrspace 1)
; GFX1010-NEXT: S_ENDPGM 0
%ptr:_(p4) = COPY $sgpr0_sgpr1
%out:_(p1) = COPY $sgpr2_sgpr3
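
To make the long check lines above easier to follow: a wide uniform load that cannot use s_load is split into <4 x s32> vgpr loads off sgpr pointer-adds, the pieces are concatenated, unmerged into s32 elements, each element is read back to sgpr with G_AMDGPU_READANYLANE, and the uniform value is rebuilt with G_BUILD_VECTOR; the pieces that feed G_STORE are then copied back to vgpr, since the stored operand has to be on the vgpr bank. Condensed to a single <4 x s32> load and store (an illustrative sketch, not one of the checked-in tests):

  %ptr:sgpr(p1) = COPY $sgpr0_sgpr1
  %out:sgpr(p1) = COPY $sgpr2_sgpr3
  ; uniform load that cannot be an s_load is issued on the vgpr bank
  %ld:vgpr(<4 x s32>) = G_LOAD %ptr(p1) :: (load (<4 x s32>), align 4, addrspace 1)
  ; split into elements, read each one back to sgpr, then rebuild the uniform vector
  %e0:vgpr(s32), %e1:vgpr(s32), %e2:vgpr(s32), %e3:vgpr(s32) = G_UNMERGE_VALUES %ld(<4 x s32>)
  %s0:sgpr(s32) = G_AMDGPU_READANYLANE %e0
  %s1:sgpr(s32) = G_AMDGPU_READANYLANE %e1
  %s2:sgpr(s32) = G_AMDGPU_READANYLANE %e2
  %s3:sgpr(s32) = G_AMDGPU_READANYLANE %e3
  %val:sgpr(<4 x s32>) = G_BUILD_VECTOR %s0(s32), %s1(s32), %s2(s32), %s3(s32)
  ; the stored operand must be vgpr, so the uniform value is copied back
  %st:vgpr(<4 x s32>) = COPY %val(<4 x s32>)
  G_STORE %st(<4 x s32>), %out(p1) :: (store (<4 x s32>), align 4, addrspace 1)

The back-to-back G_CONCAT_VECTORS/G_UNMERGE_VALUES pairs left behind in the checks above are the merge/unmerge artifacts the FixMe comment at the top of this file refers to.
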
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
index f1f8d0b6b9df5..7838e979befef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
@@ -1,7 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+
---
name: constant_load_i8_align8
legalized: true
@@ -15,12 +16,14 @@ body: |
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), align 8, addrspace 4)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX9-LABEL: name: constant_load_i8_align8
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), align 8, addrspace 4)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX10-LABEL: name: constant_load_i8_align8
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -44,12 +47,14 @@ body: |
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX9-LABEL: name: constant_load_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX10-LABEL: name: constant_load_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -73,12 +78,14 @@ body: |
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX9-LABEL: name: constant_load_i16_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX10-LABEL: name: constant_load_i16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -103,6 +110,7 @@ body: |
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ;
; GFX9-LABEL: name: constant_sextload_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
@@ -110,6 +118,7 @@ body: |
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ;
; GFX10-LABEL: name: constant_sextload_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -135,6 +144,7 @@ body: |
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ;
; GFX9-LABEL: name: constant_sextload_i16_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
@@ -142,6 +152,7 @@ body: |
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4)
; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ;
; GFX10-LABEL: name: constant_sextload_i16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -169,6 +180,7 @@ body: |
; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
+ ;
; GFX9-LABEL: name: constant_zextload_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
@@ -177,6 +189,7 @@ body: |
; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
+ ;
; GFX10-LABEL: name: constant_zextload_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -204,6 +217,7 @@ body: |
; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
+ ;
; GFX9-LABEL: name: constant_zextload_i16_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
@@ -212,6 +226,7 @@ body: |
; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
+ ;
; GFX10-LABEL: name: constant_zextload_i16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -237,12 +252,14 @@ body: |
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX9-LABEL: name: global_load_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX10-LABEL: name: global_load_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -266,12 +283,14 @@ body: |
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX9-LABEL: name: global_load_i16_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX10-LABEL: name: global_load_i16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -296,6 +315,7 @@ body: |
; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ;
; GFX9-LABEL: name: global_sextload_i8_alig4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
@@ -303,6 +323,7 @@ body: |
; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1)
; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ;
; GFX10-LABEL: name: global_sextload_i8_alig4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -329,6 +350,7 @@ body: |
; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
+ ;
; GFX9-LABEL: name: global_zextload_i16_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
@@ -337,6 +359,7 @@ body: |
; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
+ ;
; GFX10-LABEL: name: global_zextload_i16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
@@ -360,23 +383,25 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX9-LABEL: name: constant_load_i8_align2
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX10-LABEL: name: constant_load_i8_align2
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (invariant load (s8), align 2, addrspace 4)
S_ENDPGM 0, implicit %1
@@ -392,23 +417,25 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX9-LABEL: name: constant_load_i16_align2
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX10-LABEL: name: constant_load_i16_align2
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (invariant load (s16), align 2, addrspace 4)
S_ENDPGM 0, implicit %1
@@ -424,23 +451,25 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
+ ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX9-LABEL: name: constant_sextload_i8_align2
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX10-LABEL: name: constant_sextload_i8_align2
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8), align 2, addrspace 4)
S_ENDPGM 0, implicit %1
@@ -456,23 +485,25 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
+ ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX9-LABEL: name: constant_sextload_i16_align2
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX10-LABEL: name: constant_sextload_i16_align2
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]]
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s16), align 2, addrspace 4)
S_ENDPGM 0, implicit %1
@@ -488,23 +519,25 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
+ ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX9-LABEL: name: constant_zextload_i8_align2
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX10-LABEL: name: constant_zextload_i8_align2
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4)
+ ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s8), align 2, addrspace 4)
S_ENDPGM 0, implicit %1
@@ -520,23 +553,25 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
+ ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX9-LABEL: name: constant_zextload_i16_align2
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX10-LABEL: name: constant_zextload_i16_align2
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4)
+ ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16), align 2, addrspace 4)
S_ENDPGM 0, implicit %1
@@ -552,23 +587,25 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 3)
- ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3)
+ ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX9-LABEL: name: local_load_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 3)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3)
+ ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
+ ;
; GFX10-LABEL: name: local_load_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 3)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3)
+ ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s8), align 4, addrspace 3)
S_ENDPGM 0, implicit %1
@@ -587,6 +624,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 5)
; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX9-LABEL: name: private_load_i8_align4
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
@@ -594,6 +632,7 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 5)
; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ;
; GFX10-LABEL: name: private_load_i8_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
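
One contrast worth calling out in the widen-scalar-loads checks: the local (addrspace 3) case above now gets the uniform-in-vgpr treatment (vgpr load plus G_AMDGPU_READANYLANE back to sgpr), while the private (addrspace 5) case is left as before, with the address copied to vgpr and the result staying on the vgpr bank. Schematically, mirroring the check lines above (register names are illustrative):

  %sptr:sgpr(p1) = COPY $sgpr0_sgpr1
  ; addrspace(3): uniform result is read back to sgpr
  %lds:vgpr(s32) = G_LOAD %sptr(p1) :: (load (s8), align 4, addrspace 3)
  %u:sgpr(s32) = G_AMDGPU_READANYLANE %lds
  ; addrspace(5): address and result both stay on the vgpr bank
  %vptr:vgpr(p1) = COPY %sptr(p1)
  %priv:vgpr(s32) = G_LOAD %vptr(p1) :: (load (s8), align 4, addrspace 5)
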
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
index 29db4cf9eedf5..7f48a30b2069f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
@@ -12,8 +12,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 4)
; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 4, align 1)
@@ -31,8 +30,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
+ ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 1)
; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 1, align 1)
@@ -50,8 +48,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 4)
; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 4, align 2)
@@ -69,8 +66,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
+ ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 1)
; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 1, align 2)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
index a5711418a8000..3b5ec94aeb980 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
--- |
define amdgpu_kernel void @load_constant(ptr addrspace(4) %ptr0) {
@@ -110,8 +110,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32) from %ir.ptr1)
...
@@ -127,8 +127,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32) from %ir.ptr1)
...
@@ -144,8 +144,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (volatile invariant load (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (volatile invariant load (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (volatile invariant load (s32) from %ir.ptr1)
...
@@ -161,8 +161,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load acquire (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load acquire (s32) from %ir.ptr1, addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (invariant load acquire (s32) from %ir.ptr1)
...
@@ -178,8 +178,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s32) from %ir.tmp1, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s32) from %ir.tmp1, addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32) from %ir.tmp1)
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
index 084f2400a536e..c82bd6b3a4c4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
define void @shuffle_to_extract(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GFX942-LABEL: shuffle_to_extract:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 766b869aabe0f..5e6894b379a42 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding -global-isel | FileCheck --check-prefixes=SI,GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding -global-isel | FileCheck --check-prefixes=CI,GCN,SICIVI %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -global-isel | FileCheck --check-prefixes=VI,GCN,SICIVI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding -global-isel -new-reg-bank-select | FileCheck --check-prefixes=SI,GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding -global-isel -new-reg-bank-select | FileCheck --check-prefixes=CI,GCN,SICIVI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -global-isel -new-reg-bank-select | FileCheck --check-prefixes=VI,GCN,SICIVI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding -global-isel -new-reg-bank-select < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding -global-isel -new-reg-bank-select < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
; SMRD load with an immediate offset.
; GCN-LABEL: {{^}}smrd0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-divergent-addr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-divergent-addr.ll
new file mode 100644
index 0000000000000..bdd3cfe717aeb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-divergent-addr.ll
@@ -0,0 +1,429 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @store_P0_i8(i8 %a, ptr addrspace(0) %out) {
+; GFX7-LABEL: store_P0_i8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: flat_store_byte v[1:2], v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P0_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_store_b8 v[1:2], v0
+; GFX12-NEXT: s_endpgm
+ store i8 %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P0_i16(i16 %a, ptr addrspace(0) %out) {
+; GFX7-LABEL: store_P0_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: flat_store_short v[1:2], v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P0_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_store_b16 v[1:2], v0
+; GFX12-NEXT: s_endpgm
+ store i16 %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P0_i32(i32 %a, ptr addrspace(0) %out) {
+; GFX7-LABEL: store_P0_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: flat_store_dword v[1:2], v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P0_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-NEXT: s_endpgm
+ store i32 %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P0_v2i32(<2 x i32> %a, ptr addrspace(0) %out) {
+; GFX7-LABEL: store_P0_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: flat_store_dword v[2:3], v0
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v2
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX7-NEXT: flat_store_dword v[2:3], v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P0_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+ store <2 x i32> %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P0_v3i32(<3 x i32> %a, ptr addrspace(0) %out) {
+; GFX7-LABEL: store_P0_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v3
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX7-NEXT: flat_store_dword v[3:4], v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3
+; GFX7-NEXT: flat_store_dword v[5:6], v1
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P0_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_store_b96 v[3:4], v[0:2]
+; GFX12-NEXT: s_endpgm
+ store <3 x i32> %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P0_v4i32(<4 x i32> %a, ptr addrspace(0) %out) {
+; GFX7-LABEL: store_P0_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 4, v4
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GFX7-NEXT: flat_store_dword v[4:5], v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4
+; GFX7-NEXT: flat_store_dword v[6:7], v1
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT: flat_store_dword v[0:1], v3
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P0_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3]
+; GFX12-NEXT: s_endpgm
+ store <4 x i32> %a, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P1_i8(i8 %a, ptr addrspace(1) %out) {
+; GFX7-LABEL: store_P1_i8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_byte v0, v[1:2], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P1_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_store_b8 v[1:2], v0, off
+; GFX12-NEXT: s_endpgm
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P1_i16(i16 %a, ptr addrspace(1) %out) {
+; GFX7-LABEL: store_P1_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P1_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_store_b16 v[1:2], v0, off
+; GFX12-NEXT: s_endpgm
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P1_i32(i32 %a, ptr addrspace(1) %out) {
+; GFX7-LABEL: store_P1_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P1_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_store_b32 v[1:2], v0, off
+; GFX12-NEXT: s_endpgm
+ store i32 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P1_v2i32(<2 x i32> %a, ptr addrspace(1) %out) {
+; GFX7-LABEL: store_P1_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P1_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX12-NEXT: s_endpgm
+ store <2 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P1_v3i32(<3 x i32> %a, ptr addrspace(1) %out) {
+; GFX7-LABEL: store_P1_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx3 v[0:2], v[3:4], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P1_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_store_b96 v[3:4], v[0:2], off
+; GFX12-NEXT: s_endpgm
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P1_v4i32(<4 x i32> %a, ptr addrspace(1) %out) {
+; GFX7-LABEL: store_P1_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P1_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX12-NEXT: s_endpgm
+ store <4 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P3_i8(i8 %a, ptr addrspace(3) %out) {
+; GFX7-LABEL: store_P3_i8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P3_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_store_b8 v1, v0
+; GFX12-NEXT: s_endpgm
+ store i8 %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P3_i16(i16 %a, ptr addrspace(3) %out) {
+; GFX7-LABEL: store_P3_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P3_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_store_b16 v1, v0
+; GFX12-NEXT: s_endpgm
+ store i16 %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P3_i32(i32 %a, ptr addrspace(3) %out) {
+; GFX7-LABEL: store_P3_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P3_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_store_b32 v1, v0
+; GFX12-NEXT: s_endpgm
+ store i32 %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P3_v2i32(<2 x i32> %a, ptr addrspace(3) %out) {
+; GFX7-LABEL: store_P3_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_write_b64 v2, v[0:1]
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P3_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_store_b64 v2, v[0:1]
+; GFX12-NEXT: s_endpgm
+ store <2 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P3_v3i32(<3 x i32> %a, ptr addrspace(3) %out) {
+; GFX7-LABEL: store_P3_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_write_b96 v3, v[0:2]
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P3_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_store_b96 v3, v[0:2]
+; GFX12-NEXT: s_endpgm
+ store <3 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P3_v4i32(<4 x i32> %a, ptr addrspace(3) %out) {
+; GFX7-LABEL: store_P3_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_write_b128 v4, v[0:3]
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P3_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_store_b128 v4, v[0:3]
+; GFX12-NEXT: s_endpgm
+ store <4 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P5_i8(i8 %a, ptr addrspace(5) %out) {
+; GFX7-LABEL: store_P5_i8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_mov_b32 s4, s0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_u32 s4, s4, s0
+; GFX7-NEXT: s_addc_u32 s5, s5, 0
+; GFX7-NEXT: buffer_store_byte v0, v1, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P5_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_store_b8 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ store i8 %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P5_i16(i16 %a, ptr addrspace(5) %out) {
+; GFX7-LABEL: store_P5_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_mov_b32 s4, s0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_u32 s4, s4, s0
+; GFX7-NEXT: s_addc_u32 s5, s5, 0
+; GFX7-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P5_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_store_b16 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ store i16 %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P5_i32(i32 %a, ptr addrspace(5) %out) {
+; GFX7-LABEL: store_P5_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_mov_b32 s4, s0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_u32 s4, s4, s0
+; GFX7-NEXT: s_addc_u32 s5, s5, 0
+; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P5_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_endpgm
+ store i32 %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P5_v2i32(<2 x i32> %a, ptr addrspace(5) %out) {
+; GFX7-LABEL: store_P5_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_mov_b32 s4, s0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_u32 s4, s4, s0
+; GFX7-NEXT: s_addc_u32 s5, s5, 0
+; GFX7-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2
+; GFX7-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P5_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off
+; GFX12-NEXT: s_endpgm
+ store <2 x i32> %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P5_v3i32(<3 x i32> %a, ptr addrspace(5) %out) {
+; GFX7-LABEL: store_P5_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_mov_b32 s4, s0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_u32 s4, s4, s0
+; GFX7-NEXT: s_addc_u32 s5, s5, 0
+; GFX7-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3
+; GFX7-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3
+; GFX7-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P5_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_store_b96 v3, v[0:2], off
+; GFX12-NEXT: s_endpgm
+ store <3 x i32> %a, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @store_P5_v4i32(<4 x i32> %a, ptr addrspace(5) %out) {
+; GFX7-LABEL: store_P5_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_mov_b32 s4, s0
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_u32 s4, s4, s0
+; GFX7-NEXT: s_addc_u32 s5, s5, 0
+; GFX7-NEXT: buffer_store_dword v0, v4, s[4:7], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GFX7-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4
+; GFX7-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4
+; GFX7-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_P5_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_store_b128 v4, v[0:3], off
+; GFX12-NEXT: s_endpgm
+ store <4 x i32> %a, ptr addrspace(5) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 3685eed5043a3..1812e17800e71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index cce6bd9301cbf..db5d0fdb5c036 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-uniform-addr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-uniform-addr.ll
new file mode 100644
index 0000000000000..3fa4892494998
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-uniform-addr.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @store_uniform_P1_addr_i8(i8 %a, ptr addrspace(1) inreg %out) {
+; GFX7-LABEL: store_uniform_P1_addr_i8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_uniform_P1_addr_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
+ store i8 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_uniform_P1_addr_i16(i16 %a, ptr addrspace(1) inreg %out) {
+; GFX7-LABEL: store_uniform_P1_addr_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_uniform_P1_addr_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
+ store i16 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_uniform_P1_addr_i32(i32 %a, ptr addrspace(1) inreg %out) {
+; GFX7-LABEL: store_uniform_P1_addr_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_uniform_P1_addr_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
+ store i32 %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_uniform_P1_addr_v2i32(<2 x i32> %a, ptr addrspace(1) inreg %out) {
+; GFX7-LABEL: store_uniform_P1_addr_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_uniform_P1_addr_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
+ store <2 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_uniform_P1_addr_v3i32(<3 x i32> %a, ptr addrspace(1) inreg %out) {
+; GFX7-LABEL: store_uniform_P1_addr_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_uniform_P1_addr_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_endpgm
+ store <3 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @store_uniform_P1_addr_v4i32(<4 x i32> %a, ptr addrspace(1) inreg %out) {
+; GFX7-LABEL: store_uniform_P1_addr_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: store_uniform_P1_addr_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_endpgm
+ store <4 x i32> %a, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll
index dccc55b7d045b..d6e8d0a0a4788 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
+; RUN: not llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
; GISEL-ERR: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_LOAD %{{[0-9]+}}:vgpr(p8) :: (load (s32) from %ir.rsrc, addrspace 8)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index d28840d36ed65..2d3ce9469ee90 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+
+; FixMe: decide whether to move multiple instructions to vgpr
define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i8_align4:
@@ -338,15 +340,17 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: s_lshr_b32 s2, s2, 16
+; GFX8-NEXT: s_add_u32 s0, s0, 2
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_sextload_i8_align2:
@@ -356,8 +360,11 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_sextload_i8_align2:
@@ -367,8 +374,11 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: s_lshr_b32 s2, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%sextload = sext i8 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
index 9cd9c4734fbe6..4511c364b8a7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define i32 @zextload_global_i1_to_i32(ptr addrspace(1) %ptr) {
; GFX9-LABEL: zextload_global_i1_to_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 4b52d6efb8e98..5dcf5d437bae6 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL
+
+; FixMe: decide whether to move multiple instructions to vgpr that will be folded by an isel pattern
define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds1align1:
@@ -43,11 +45,18 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s1, s0
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s1, 8
; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds2align1:
@@ -109,22 +118,35 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
-; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3
-; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
+; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:3
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(3)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:1
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:2
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s4, s3
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: s_and_b32 s3, 0xffff, s0
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s3, s3, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s0, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds4align1:
@@ -165,10 +187,17 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s1, s1, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s1, s0
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v2, v0 offset:2
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds4align2:
@@ -246,30 +275,56 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v6
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v5
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1
-; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 24, v1
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2
-; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:3
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4
-; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6
-; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
+; ALIGNED-GISEL-NEXT: s_and_b32 s4, 0xffff, s0
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s4, s4, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s3, s0, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s0, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s2
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:4
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:5
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s2, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:6
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:7
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds8align1:
@@ -317,15 +372,28 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(3)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
-; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2
-; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:6
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v3
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s4, s3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s3, s0, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:2
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:4
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:6
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds8align2:
@@ -426,47 +494,85 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
-; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5
-; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8
-; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9
-; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16
+; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 offset:8
+; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:9
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v6
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v5
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:10
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v8
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2
-; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:3
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v2
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10
-; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4
+; ALIGNED-GISEL-NEXT: s_and_b32 s5, 0xffff, s0
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s5, s5, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s0, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s2
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:4
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:5
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s2, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:6
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s3
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:7
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:9
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s3, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:10
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:11
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds12align1:
@@ -521,19 +627,37 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v4
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v3
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s4, 16
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5
-; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v1 offset:2
-; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v2 offset:6
-; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v0 offset:10
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s2, s3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v5
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:2
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:4
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:6
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:10
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds12align2:
@@ -727,62 +851,114 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v6
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v5
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v8
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
-; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8
-; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9
-; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10
-; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:11
-; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:12
-; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:13
-; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 offset:14
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v8
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
+; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 offset:8
+; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:9
+; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:10
+; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:11
+; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:12
+; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:13
+; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:14
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v6
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; ALIGNED-GISEL-NEXT: v_or3_b32 v3, v4, v5, v3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v4, v8, 8, v7
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v7
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9
-; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:3
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v2
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v3
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13
-; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14
-; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s6, v0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s6, s6, 24
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s5, s6, s5
+; ALIGNED-GISEL-NEXT: s_and_b32 s6, 0xffff, s0
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s6, s6, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s5, s0, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s0, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s2
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:4
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:5
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s2, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:6
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s3
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:7
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:9
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s3, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:10
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s4
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:11
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:12
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:13
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s4, 24
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:14
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:15
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds16align1:
@@ -843,22 +1019,46 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:12
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v5
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v7
-; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2
-; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:4
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v2 offset:6
-; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v3 offset:8
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v3 offset:10
-; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:12
-; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:14
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v0
+; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v7
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 16
+; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s5, s0, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s5
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:2
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:4
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:6
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:8
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:10
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:12
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:14
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds16align2:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size.ll b/llvm/test/CodeGen/AMDGPU/lds-size.ll
index 655475c6543e2..75732a58eafc4 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-size.ll
@@ -1,5 +1,5 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
; This test makes sure we do not double count global values when they are
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll
index 265353675b349..cc1dd536020a7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -new-reg-bank-select -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM
define void @local_to_flat(ptr addrspace(3) %ptr) {
; ASM-LABEL: local_to_flat:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
index 50f1beba25227..2439949514ce9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
@@ -1,5 +1,5 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
declare i64 @llvm.amdgcn.dispatch.id() #1
diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
index 75e7a63c540e5..7ae57ff8ec276 100644
--- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s
define i32 @range_metadata_sext_i8_signed_range_i32(ptr addrspace(1) %ptr) {
; GCN-LABEL: range_metadata_sext_i8_signed_range_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index a3c38b17abf00..e50ed3ee95140 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -5,12 +5,12 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; Test splitting flat instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 20916a9a51d9e..ef9d4aff3065e 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
diff --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll
index f6a5af55840ac..bee329d8ba39a 100644
--- a/llvm/test/CodeGen/AMDGPU/read_register.ll
+++ b/llvm/test/CodeGen/AMDGPU/read_register.ll
@@ -1,5 +1,5 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
declare i32 @llvm.read_register.i32(metadata) #0
declare i64 @llvm.read_register.i64(metadata) #0
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
index 902e3ef5c2397..cdb62ce664677 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GISEL
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GISEL
define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %flag) {
; GCN-LABEL: sink_scratch_pointer:
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 9c7f393d35932..a7affb93c1c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -1,27 +1,27 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; enable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
; disable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
; RUN: llc -global-isel=0 -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index fc323c6e66a3d..7a64e55abb8d3 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
;