[llvm] [AMDGPU] CodeGen for SMEM instructions (PR #75579)
Mirko Brkušanin via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 15 02:36:25 PST 2023
https://github.com/mbrkusanin updated https://github.com/llvm/llvm-project/pull/75579
From 18b7bfeee3dae93890117ac9dcbf0c9d5e2ca57f Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Fri, 15 Dec 2023 10:36:06 +0100
Subject: [PATCH 1/2] [AMDGPU] CodeGen for SMEM instructions
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 12 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 8 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 90 +-
.../Target/AMDGPU/SILoadStoreOptimizer.cpp | 39 +-
llvm/lib/Target/AMDGPU/SMInstructions.td | 14 +-
.../legalize-llvm.amdgcn.s.buffer.load.mir | 247 +-
.../GlobalISel/llvm.amdgcn.s.buffer.load.ll | 1650 ++++++++++++++
.../regbankselect-amdgcn.s.buffer.load.ll | 1506 ++++++++++++
.../AMDGPU/GlobalISel/regbankselect-load.mir | 764 ++++---
...gbankselect-split-scalar-load-metadata.mir | 59 +-
.../AMDGPU/cgp-addressing-modes-smem.ll | 137 +-
llvm/test/CodeGen/AMDGPU/clamp.ll | 10 +-
.../AMDGPU/llvm.amdgcn.permlane16.var.ll | 48 +-
.../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 12 +-
.../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 92 +-
.../AMDGPU/llvm.amdgcn.s.buffer.load.ll | 273 +++
llvm/test/CodeGen/AMDGPU/load-constant-f32.ll | 26 +
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 39 +
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 2018 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 1329 +++++++++++
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 647 ++++++
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 127 ++
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 1808 +++++++++++++++
llvm/test/CodeGen/AMDGPU/merge-s-load.mir | 162 +-
.../CodeGen/AMDGPU/merge-sbuffer-load.mir | 72 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 4 +-
28 files changed, 10555 insertions(+), 643 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 91e0c86cd365c6..060fb66d38f7bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -828,6 +828,12 @@ def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
"Has Pseudo Scalar Transcendental instructions"
>;
+def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
+ "HasRestrictedSOffset",
+ "true",
+ "Has restricted SOffset (immediate not supported)."
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1474,6 +1480,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
+ FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
@@ -1787,6 +1794,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;
+def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
+ AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>;
+def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
+ AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;
+
def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 489b4f5a8d86a5..f3a59109b48219 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -460,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
return false;
// If we have 96-bit memory operations, we shouldn't touch them. Note we may
- // end up widening these for a scalar load during RegBankSelect, since there
- // aren't 96-bit scalar loads.
+ // end up widening these for a scalar load during RegBankSelect, if we don't
+ // have 96-bit scalar loads.
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
return false;
@@ -6467,10 +6467,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
- // There are no 96-bit result scalar loads, but widening to 128-bit should
+ // If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
- if (!isPowerOf2_32(Size)) {
+ if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b2f4c114dcbb3c..b47fafb273442d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
if (DstBank == &AMDGPU::SGPRRegBank) {
// There are some special cases that we need to look at for 32 bit and 96
// bit SGPR loads otherwise we have nothing to do.
- if (LoadSize != 32 && LoadSize != 96)
+ if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a675da8da33989..31a5a99e51bb34 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -199,6 +199,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
bool HasPseudoScalarTrans = false;
+ bool HasRestrictedSOffset = false;
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
@@ -1163,6 +1164,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
+ bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 708f212e204acf..f4d2c695e317cd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1438,11 +1438,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
- } else {
+ } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
// On GFX9 the offset is signed 21-bit in bytes (but must not be negative
// for S_BUFFER_* instructions).
if (!isInt<21>(AM.BaseOffs))
return false;
+ } else {
+ // On GFX12, all offsets are signed 24-bit in bytes.
+ if (!isInt<24>(AM.BaseOffs))
+ return false;
}
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
@@ -7497,7 +7501,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
};
// Widen vec3 load to vec4.
- if (VT.isVector() && VT.getVectorNumElements() == 3) {
+ if (VT.isVector() && VT.getVectorNumElements() == 3 &&
+ !Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
@@ -7913,6 +7918,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// On targets that do not support a constant in the soffset field, turn zero
+// into SGPR_NULL to avoid generating an extra s_mov with zero.
+static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
+ const GCNSubtarget *Subtarget) {
+ if (Subtarget->hasRestrictedSOffset())
+ if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) {
+ if (SOffsetConst->isZero()) {
+ return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
+ }
+ }
+ return SOffset;
+}
+
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -7921,13 +7939,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7954,13 +7973,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8116,12 +8136,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8140,12 +8161,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8157,21 +8179,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- Op.getOperand(5), // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // voffset
+ SOffset, // soffset
+ Op.getOperand(6), // offset
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -8187,13 +8210,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // format
Op.getOperand(6), // cachepolicy, swizzled buffer
@@ -8213,13 +8237,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8432,6 +8457,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8439,7 +8465,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8454,6 +8480,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8461,7 +8488,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
Op.getOperand(5), // vindex
Offsets.first, // voffset
- Op.getOperand(7), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8893,13 +8920,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
@@ -8920,13 +8948,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -9000,13 +9029,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -9050,13 +9080,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -9404,8 +9435,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
return;
}
}
+
+ SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
+ ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
+ : DAG.getConstant(0, DL, MVT::i32);
+
Offsets[0] = CombinedOffset;
- Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[1] = SOffsetZero;
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
@@ -9663,7 +9699,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
@@ -9679,7 +9716,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 17105965471f65..dfd84b66bec721 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -161,8 +161,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
if (!AddrOp->isReg())
return false;
- // TODO: We should be able to merge physical reg addresses.
- if (AddrOp->getReg().isPhysical())
+ // TODO: We should be able to merge instructions with other physical reg
+ // addresses too.
+ if (AddrOp->getReg().isPhysical() &&
+ AddrOp->getReg() != AMDGPU::SGPR_NULL)
return false;
// If an address has only one use then there will be no other
@@ -350,6 +352,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::FLAT_LOAD_DWORDX2:
case AMDGPU::FLAT_STORE_DWORDX2:
return 2;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -443,16 +448,19 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
return UNKNOWN;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
return S_LOAD_IMM;
@@ -524,16 +532,19 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
return Opc;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
return AMDGPU::S_LOAD_DWORD_IMM;
@@ -631,16 +642,19 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
return Result;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
Result.SBase = true;
@@ -967,6 +981,17 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
return false;
if (CI.CPol != Paired.CPol)
return false;
+ if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
+ CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
+ // Reject cases like:
+ // dword + dwordx2 -> dwordx3
+ // dword + dwordx3 -> dwordx4
+ // If we tried to combine these cases, we would fail to extract a subreg
+ // for the result of the second load due to SGPR alignment requirements.
+ if (CI.Width != Paired.Width &&
+ (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
+ return false;
+ }
return true;
}
@@ -1046,6 +1071,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
case 4:
case 8:
return true;
+ case 3:
+ return STM.hasScalarDwordx3Loads();
}
}
}
@@ -1674,6 +1701,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
@@ -1685,6 +1714,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ case 3:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
@@ -1696,6 +1727,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_LOAD_DWORDX3_IMM;
case 4:
return AMDGPU::S_LOAD_DWORDX4_IMM;
case 8:
@@ -1817,6 +1850,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
return nullptr;
case 2:
return &AMDGPU::SReg_64_XEXECRegClass;
+ case 3:
+ return &AMDGPU::SGPR_96RegClass;
case 4:
return &AMDGPU::SGPR_128RegClass;
case 8:
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index d24bfd535d4ddc..231c897390e5c5 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -826,7 +826,7 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
-multiclass SMRD_Pattern <string Instr, ValueType vt> {
+multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. IMM offset
def : GCNPat <
@@ -835,7 +835,7 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
>;
// 2. 32-bit IMM offset on CI
- def : GCNPat <
+ if immci then def : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
(vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
let OtherPredicates = [isGFX7Only];
@@ -867,7 +867,7 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
>;
}
-multiclass SMLoad_Pattern <string Instr, ValueType vt> {
+multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. Offset as an immediate
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
@@ -876,7 +876,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
}
// 2. 32-bit IMM offset on CI
- def : GCNPat <
+ if immci then def : GCNPat <
(vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
(!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
(extract_cpol $cachepolicy))> {
@@ -919,6 +919,10 @@ foreach vt = SReg_64.RegTypes in {
defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>;
}
+foreach vt = SReg_96.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX3", vt, false>;
+}
+
foreach vt = SReg_128.RegTypes in {
defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>;
}
@@ -935,12 +939,14 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3i32, false>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3f32, false>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
index 420c55f8f6da21..fb2a548cd79457 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
---
name: s_buffer_load_s32
@@ -28,15 +29,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-LABEL: name: s_buffer_load_v3s32
- ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
- ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
- ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
- ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX67-LABEL: name: s_buffer_load_v3s32
+ ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: {{ $}}
+ ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
+ ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
+ ; GFX67-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v3s32
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
@@ -50,16 +59,25 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-LABEL: name: s_buffer_load_v3p3
- ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
- ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
- ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
- ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
- ; GCN-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>)
+ ; GFX67-LABEL: name: s_buffer_load_v3p3
+ ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: {{ $}}
+ ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
+ ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
+ ; GFX67-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX67-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>)
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v3p3
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<3 x p3>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
@@ -73,16 +91,25 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-LABEL: name: s_buffer_load_v6s16
- ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
- ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
- ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
- ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
- ; GCN-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>)
+ ; GFX67-LABEL: name: s_buffer_load_v6s16
+ ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: {{ $}}
+ ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
+ ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
+ ; GFX67-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX67-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>)
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v6s16
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
@@ -140,52 +167,92 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-LABEL: name: s_buffer_load_v12s8
- ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
- ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
- ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
- ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GCN-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
- ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
- ; GCN-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32)
- ; GCN-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
- ; GCN-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
- ; GCN-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32)
- ; GCN-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
- ; GCN-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
- ; GCN-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
- ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C4]]
- ; GCN-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
- ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
- ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
- ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[C2]](s32)
- ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL1]]
- ; GCN-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; GCN-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C4]]
- ; GCN-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C4]]
- ; GCN-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
- ; GCN-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL2]]
- ; GCN-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; GCN-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR5]], [[C2]](s32)
- ; GCN-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR4]], [[SHL3]]
- ; GCN-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
- ; GCN-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C4]]
- ; GCN-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]]
- ; GCN-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32)
- ; GCN-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]]
- ; GCN-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
- ; GCN-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LSHR8]], [[C2]](s32)
- ; GCN-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[SHL5]]
- ; GCN-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
- ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
- ; GCN-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
+ ; GFX67-LABEL: name: s_buffer_load_v12s8
+ ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: {{ $}}
+ ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
+ ; GFX67-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; GFX67-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
+ ; GFX67-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX67-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
+ ; GFX67-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+ ; GFX67-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32)
+ ; GFX67-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
+ ; GFX67-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
+ ; GFX67-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32)
+ ; GFX67-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
+ ; GFX67-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+ ; GFX67-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
+ ; GFX67-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX67-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C4]]
+ ; GFX67-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+ ; GFX67-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX67-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+ ; GFX67-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GFX67-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[C2]](s32)
+ ; GFX67-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL1]]
+ ; GFX67-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+ ; GFX67-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C4]]
+ ; GFX67-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C4]]
+ ; GFX67-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+ ; GFX67-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL2]]
+ ; GFX67-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+ ; GFX67-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR5]], [[C2]](s32)
+ ; GFX67-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR4]], [[SHL3]]
+ ; GFX67-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
+ ; GFX67-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C4]]
+ ; GFX67-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]]
+ ; GFX67-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32)
+ ; GFX67-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]]
+ ; GFX67-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+ ; GFX67-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LSHR8]], [[C2]](s32)
+ ; GFX67-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[SHL5]]
+ ; GFX67-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+ ; GFX67-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+ ; GFX67-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v12s8
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+ ; GFX12-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32)
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX12-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
+ ; GFX12-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
+ ; GFX12-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32)
+ ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+ ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+ ; GFX12-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+ ; GFX12-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+ ; GFX12-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
+ ; GFX12-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+ ; GFX12-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
+ ; GFX12-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32)
+ ; GFX12-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
+ ; GFX12-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+ ; GFX12-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32)
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16)
+ ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16)
+ ; GFX12-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
@@ -200,15 +267,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-LABEL: name: s_buffer_load_s96
- ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
- ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
- ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
- ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX67-LABEL: name: s_buffer_load_s96
+ ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: {{ $}}
+ ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
+ ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
+ ; GFX67-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_s96
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
index b621185d83edd7..d31570e47db771 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
@@ -2,6 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; FIXME: Merge with regbankselect, which mostly overlaps when all types supported.
@@ -54,6 +55,22 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret i32 %val
}
@@ -106,6 +123,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_glc
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 1 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 1)
ret i32 %val
}
@@ -173,6 +206,27 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v2i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <2 x i32> %val
}
@@ -255,6 +309,31 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v3i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM:%[0-9]+]]:sgpr_96 = S_BUFFER_LOAD_DWORDX3_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
%val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <3 x i32> %val
}
@@ -394,6 +473,51 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX8-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
; GFX8-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub7
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
+ ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec
+ ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec
+ ; GFX12-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]]
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
+ ; GFX12-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
+ ; GFX12-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
+ ; GFX12-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
%val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x i32> %val
}
@@ -629,6 +753,83 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
; GFX8-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec
; GFX8-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s512), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub7
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub8
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub9
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub10
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub11
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub12
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub13
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub14
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub15
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec
+ ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX12-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec
+ ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX12-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec
+ ; GFX12-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]]
+ ; GFX12-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec
+ ; GFX12-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX12-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec
+ ; GFX12-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX12-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec
+ ; GFX12-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]]
+ ; GFX12-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec
+ ; GFX12-NEXT: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]]
+ ; GFX12-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec
+ ; GFX12-NEXT: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]]
+ ; GFX12-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec
+ ; GFX12-NEXT: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]]
+ ; GFX12-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec
+ ; GFX12-NEXT: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]]
+ ; GFX12-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec
+ ; GFX12-NEXT: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]]
+ ; GFX12-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec
+ ; GFX12-NEXT: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]]
+ ; GFX12-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec
+ ; GFX12-NEXT: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]]
+ ; GFX12-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec
+ ; GFX12-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
%val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x i32> %val
}
@@ -680,6 +881,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_1
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1, i32 0)
ret i32 %val
}
@@ -729,6 +945,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_glc_4
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 4, 1 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 4, i32 1)
ret i32 %val
}
@@ -780,6 +1011,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_255
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 255, i32 0)
ret i32 %val
}
@@ -829,6 +1075,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_256
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 256, i32 0)
ret i32 %val
}
@@ -878,6 +1139,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_1020
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1020, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1020, i32 0)
ret i32 %val
}
@@ -929,6 +1205,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_1023
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1023, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1023, i32 0)
ret i32 %val
}
@@ -979,6 +1270,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_1024
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1024, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1024, i32 0)
ret i32 %val
}
@@ -1030,6 +1336,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_1025
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1025, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1025, i32 0)
ret i32 %val
}
@@ -1082,6 +1403,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg1
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0)
ret i32 %load
}
@@ -1133,6 +1470,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg4
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0)
ret i32 %load
}
@@ -1184,6 +1537,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg8
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0)
ret i32 %load
}
@@ -1235,6 +1604,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit31
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0)
ret i32 %load
}
@@ -1286,6 +1671,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc)
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_glc_bit30
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 1 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 1)
ret i32 %load
}
@@ -1337,6 +1738,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit29
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0)
ret i32 %load
}
@@ -1388,6 +1805,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit21
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 2097152, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0)
ret i32 %load
}
@@ -1439,6 +1871,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit20
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1048576, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0)
ret i32 %load
}
@@ -1490,6 +1937,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc)
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg_bit20
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0)
ret i32 %load
}
@@ -1540,6 +2003,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) {
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit19
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0)
ret i32 %load
}
@@ -1591,6 +2069,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc)
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg_bit19
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288
+ ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0)
ret i32 %load
}
@@ -1641,6 +2135,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
}
@@ -1699,6 +2208,24 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
; GFX8-NEXT: $vgpr0 = COPY [[COPY5]]
; GFX8-NEXT: $vgpr1 = COPY [[COPY6]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v2f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s64), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <2 x float> %val
}
@@ -1766,6 +2293,26 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
; GFX8-NEXT: $vgpr1 = COPY [[COPY6]]
; GFX8-NEXT: $vgpr2 = COPY [[COPY7]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v3f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <3 x float> %val
}
@@ -1836,6 +2383,28 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
; GFX8-NEXT: $vgpr2 = COPY [[COPY7]]
; GFX8-NEXT: $vgpr3 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v4f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub3
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <4 x float> %val
}
@@ -1936,6 +2505,38 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
; GFX8-NEXT: $vgpr6 = COPY [[COPY11]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY12]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
}
@@ -2090,6 +2691,56 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
; GFX8-NEXT: $vgpr14 = COPY [[COPY19]]
; GFX8-NEXT: $vgpr15 = COPY [[COPY20]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub8
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub9
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub10
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub11
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub12
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub13
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub14
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub15
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr8 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr9 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr10 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr11 = COPY [[COPY16]]
+ ; GFX12-NEXT: $vgpr12 = COPY [[COPY17]]
+ ; GFX12-NEXT: $vgpr13 = COPY [[COPY18]]
+ ; GFX12-NEXT: $vgpr14 = COPY [[COPY19]]
+ ; GFX12-NEXT: $vgpr15 = COPY [[COPY20]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
%val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x float> %val
}
@@ -2139,6 +2790,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4092
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -2189,6 +2855,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4095
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -2239,6 +2920,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4096
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -2341,6 +3037,38 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
; GFX8-NEXT: $vgpr6 = COPY [[COPY11]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY12]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4064
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -2443,6 +3171,38 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
; GFX8-NEXT: $vgpr6 = COPY [[COPY11]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY12]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4068
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -2598,6 +3358,56 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
; GFX8-NEXT: $vgpr14 = COPY [[COPY19]]
; GFX8-NEXT: $vgpr15 = COPY [[COPY20]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub8
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub9
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub10
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub11
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub12
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub13
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub14
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub15
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr8 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr9 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr10 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr11 = COPY [[COPY16]]
+ ; GFX12-NEXT: $vgpr12 = COPY [[COPY17]]
+ ; GFX12-NEXT: $vgpr13 = COPY [[COPY18]]
+ ; GFX12-NEXT: $vgpr14 = COPY [[COPY19]]
+ ; GFX12-NEXT: $vgpr15 = COPY [[COPY20]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
%soffset = add i32 %soffset.base, 4032
%val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x float> %val
@@ -2753,6 +3563,56 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
; GFX8-NEXT: $vgpr14 = COPY [[COPY19]]
; GFX8-NEXT: $vgpr15 = COPY [[COPY20]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub8
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub9
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub10
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub11
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub12
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub13
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub14
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub15
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr8 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr9 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr10 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr11 = COPY [[COPY16]]
+ ; GFX12-NEXT: $vgpr12 = COPY [[COPY17]]
+ ; GFX12-NEXT: $vgpr13 = COPY [[COPY18]]
+ ; GFX12-NEXT: $vgpr14 = COPY [[COPY19]]
+ ; GFX12-NEXT: $vgpr15 = COPY [[COPY20]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
%soffset = add i32 %soffset.base, 4036
%val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x float> %val
@@ -2903,6 +3763,54 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
; GFX8-NEXT: bb.5:
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
}
@@ -3046,6 +3954,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
; GFX8-NEXT: bb.5:
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4092
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -3202,6 +4156,56 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
; GFX8-NEXT: bb.5:
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_2]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4096
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -3346,6 +4350,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
; GFX8-NEXT: bb.5:
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0)
ret float %val
}
@@ -3493,6 +4543,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
; GFX8-NEXT: bb.5:
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0)
ret float %val
}
@@ -3688,6 +4784,69 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; GFX8-NEXT: $vgpr6 = COPY [[COPY15]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY16]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4064
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -3896,6 +5055,73 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; GFX8-NEXT: $vgpr6 = COPY [[COPY16]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY17]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
+ ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_2]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY16]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY17]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4068
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -4102,6 +5328,73 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; GFX8-NEXT: $vgpr6 = COPY [[COPY16]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY17]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_2]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY16]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY17]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4096
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -4299,6 +5592,70 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; GFX8-NEXT: $vgpr6 = COPY [[COPY15]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY16]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4064
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 936, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 952, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %offset.base, 5000
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -4496,6 +5853,70 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; GFX8-NEXT: $vgpr6 = COPY [[COPY15]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY16]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 12
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %offset.base, 4076
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -4693,6 +6114,70 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; GFX8-NEXT: $vgpr6 = COPY [[COPY15]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY16]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %offset.base, 4080
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -4887,6 +6372,69 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; GFX8-NEXT: $vgpr6 = COPY [[COPY14]]
; GFX8-NEXT: $vgpr7 = COPY [[COPY15]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY9]]
+ ; GFX12-NEXT: $vgpr2 = COPY [[COPY10]]
+ ; GFX12-NEXT: $vgpr3 = COPY [[COPY11]]
+ ; GFX12-NEXT: $vgpr4 = COPY [[COPY12]]
+ ; GFX12-NEXT: $vgpr5 = COPY [[COPY13]]
+ ; GFX12-NEXT: $vgpr6 = COPY [[COPY14]]
+ ; GFX12-NEXT: $vgpr7 = COPY [[COPY15]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0)
ret <8 x float> %val
}
@@ -4936,6 +6484,21 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset = add i32 %offset.v, %offset.s
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
ret float %val
@@ -4986,6 +6549,21 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset = add i32 %offset.s, %offset.v
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
ret float %val
@@ -5045,6 +6623,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.v, %offset.s
%offset = add i32 %offset.base, 1024
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
@@ -5105,6 +6701,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.s, %offset.v
%offset = add i32 %offset.base, 1024
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
@@ -5166,6 +6780,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.s, 1024
%offset = add i32 %offset.base, %offset.v
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
@@ -5226,6 +6858,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.v, 1024
%offset = add i32 %offset.base, %offset.s
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index 2c84b7ccea4015..80bd85d16f357e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-fast -o - %s | FileCheck %s -check-prefix=GFX7
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s -check-prefix=GFX7
+; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -simplify-mir -stop-after=amdgpu-regbankselect -o - %s | FileCheck %s -check-prefix=GFX12
; Natural mapping
define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
@@ -19,6 +20,22 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret i32 %val
}
@@ -43,6 +60,26 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v2i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <2 x i32> %val
}
@@ -70,6 +107,29 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v3i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+ ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
%val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <3 x i32> %val
}
@@ -112,6 +172,44 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
; GFX7-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
; GFX7-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+ ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
+ ; GFX12-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+ ; GFX12-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
+ ; GFX12-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
+ ; GFX12-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
+ ; GFX12-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
%val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x i32> %val
}
@@ -178,6 +276,68 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
; GFX7-NEXT: [[INTRINSIC_CONVERGENT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
; GFX7-NEXT: $sgpr15 = COPY [[INTRINSIC_CONVERGENT15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16i32
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+ ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
+ ; GFX12-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+ ; GFX12-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
+ ; GFX12-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
+ ; GFX12-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
+ ; GFX12-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
+ ; GFX12-NEXT: $sgpr8 = COPY [[INTRINSIC_CONVERGENT8]](s32)
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
+ ; GFX12-NEXT: $sgpr9 = COPY [[INTRINSIC_CONVERGENT9]](s32)
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
+ ; GFX12-NEXT: $sgpr10 = COPY [[INTRINSIC_CONVERGENT10]](s32)
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
+ ; GFX12-NEXT: $sgpr11 = COPY [[INTRINSIC_CONVERGENT11]](s32)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
+ ; GFX12-NEXT: $sgpr12 = COPY [[INTRINSIC_CONVERGENT12]](s32)
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
+ ; GFX12-NEXT: $sgpr13 = COPY [[INTRINSIC_CONVERGENT13]](s32)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
+ ; GFX12-NEXT: $sgpr14 = COPY [[INTRINSIC_CONVERGENT14]](s32)
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
+ ; GFX12-NEXT: $sgpr15 = COPY [[INTRINSIC_CONVERGENT15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
%val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x i32> %val
}
@@ -199,6 +359,22 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
}
@@ -221,6 +397,24 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v2f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <2 x float> %val
}
@@ -244,6 +438,25 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v3f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<3 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<3 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <3 x float> %val
}
@@ -268,6 +481,26 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v4f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <4 x float> %val
}
@@ -298,6 +531,32 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
}
@@ -338,6 +597,42 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
%val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x float> %val
}
@@ -360,6 +655,24 @@ define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32
; GFX7-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128)
; GFX7-NEXT: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i96_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s96) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[AMDGPU_BUFFER_LOAD]](s96), [[COPY5]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0)
store i96 %val, ptr addrspace(1) undef
ret void
@@ -389,6 +702,31 @@ define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
; GFX7-NEXT: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i256_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](s128), [[COPY5]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](s128), [[COPY6]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0)
store i256 %val, ptr addrspace(1) undef
ret void
@@ -426,6 +764,41 @@ define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32
; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
; GFX7-NEXT: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_i512_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](s128), [[COPY5]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](s128), [[COPY6]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
+ ; GFX12-NEXT: G_STORE [[UV2]](s128), [[COPY7]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 32, align 8, addrspace 1)
+ ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
+ ; GFX12-NEXT: G_STORE [[UV3]](s128), [[COPY8]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0)
store i512 %val, ptr addrspace(1) undef
ret void
@@ -455,6 +828,31 @@ define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
; GFX7-NEXT: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16i16_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](<8 x s16>), [[COPY5]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](<8 x s16>), [[COPY6]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
store <16 x i16> %val, ptr addrspace(1) undef
ret void
@@ -492,6 +890,41 @@ define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i
; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
; GFX7-NEXT: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v32i16_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](<8 x s16>), [[COPY5]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](<8 x s16>), [[COPY6]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
+ ; GFX12-NEXT: G_STORE [[UV2]](<8 x s16>), [[COPY7]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
+ ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
+ ; GFX12-NEXT: G_STORE [[UV3]](<8 x s16>), [[COPY8]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
store <32 x i16> %val, ptr addrspace(1) undef
ret void
@@ -521,6 +954,31 @@ define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
; GFX7-NEXT: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v4i64_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](<2 x s64>), [[COPY5]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](<2 x s64>), [[COPY6]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
store <4 x i64> %val, ptr addrspace(1) undef
ret void
@@ -558,6 +1016,41 @@ define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
; GFX7-NEXT: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8i64_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](<2 x s64>), [[COPY5]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](<2 x s64>), [[COPY6]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
+ ; GFX12-NEXT: G_STORE [[UV2]](<2 x s64>), [[COPY7]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
+ ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
+ ; GFX12-NEXT: G_STORE [[UV3]](<2 x s64>), [[COPY8]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
store <8 x i64> %val, ptr addrspace(1) undef
ret void
@@ -587,6 +1080,31 @@ define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
; GFX7-NEXT: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v4p1_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](<2 x p1>), [[COPY5]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](<2 x p1>), [[COPY6]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32> %rsrc, i32 %soffset, i32 0)
store <4 x ptr addrspace(1)> %val, ptr addrspace(1) undef
ret void
@@ -624,6 +1142,41 @@ define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
; GFX7-NEXT: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8p1_vgpr_offset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
+ ; GFX12-NEXT: G_STORE [[UV]](<2 x p1>), [[COPY5]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
+ ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
+ ; GFX12-NEXT: G_STORE [[UV1]](<2 x p1>), [[COPY6]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
+ ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
+ ; GFX12-NEXT: G_STORE [[UV2]](<2 x p1>), [[COPY7]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
+ ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
+ ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
+ ; GFX12-NEXT: G_STORE [[UV3]](<2 x p1>), [[COPY8]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
%val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32> %rsrc, i32 %soffset, i32 0)
store <8 x ptr addrspace(1)> %val, ptr addrspace(1) undef
ret void
@@ -648,6 +1201,25 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4092
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -672,6 +1244,25 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4095
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -695,6 +1286,25 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4096
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -730,6 +1340,35 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4064
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -764,6 +1403,35 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4068
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -808,6 +1476,45 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
%soffset = add i32 %soffset.base, 4032
%val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x float> %val
@@ -851,6 +1558,45 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
%soffset = add i32 %soffset.base, 4036
%val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x float> %val
@@ -903,6 +1649,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
; GFX7-NEXT: bb.5:
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
}
@@ -955,6 +1747,53 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
; GFX7-NEXT: bb.5:
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4092
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -1009,6 +1848,54 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
; GFX7-NEXT: bb.5:
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %16, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 4096
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret float %val
@@ -1061,6 +1948,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
; GFX7-NEXT: bb.5:
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0)
ret float %val
}
@@ -1112,6 +2045,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
; GFX7-NEXT: bb.5:
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4096)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0)
ret float %val
}
@@ -1175,6 +2154,63 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4064
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -1240,6 +2276,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4068
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -1303,6 +2397,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %soffset.base, 4096
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -1365,6 +2517,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 936, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 952, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %offset.base, 5000
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -1427,6 +2637,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %offset.base, 4076
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -1489,6 +2757,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%soffset = add i32 %offset.base, 4080
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x float> %val
@@ -1550,6 +2876,62 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
+ ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
+ ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+ ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
+ ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
+ ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
+ ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4, %bb.2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
+ ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
+ ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
+ ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
+ ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
+ ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
+ ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
+ ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0)
ret <8 x float> %val
}
@@ -1572,6 +2954,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
+ ; GFX12-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset = add i32 %offset.v, %offset.s
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
ret float %val
@@ -1595,6 +2995,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
+ ; GFX12-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset = add i32 %offset.s, %offset.v
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
ret float %val
@@ -1622,6 +3040,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.v, %offset.s
%offset = add i32 %offset.base, 1024
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
@@ -1650,6 +3090,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.s, %offset.v
%offset = add i32 %offset.base, 1024
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
@@ -1679,6 +3141,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.s, 1024
%offset = add i32 %offset.base, %offset.v
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
@@ -1707,6 +3191,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+ ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
+ ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.v, 1024
%offset = add i32 %offset.base, %offset.s
%val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index 9a42745e76f64e..442902c9fc8f55 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -check-prefixes=GCN,GFX7
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s -check-prefixes=GCN,GFX7
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12
--- |
define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) {
@@ -113,16 +114,16 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v8i32_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GCN-LABEL: name: load_global_v8i32_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.global.not.uniform.v8i32)
...
@@ -135,16 +136,16 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v4i64_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+ ; GCN-LABEL: name: load_global_v4i64_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.global.not.uniform.v4i64)
...
@@ -156,22 +157,22 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v16i32_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, basealign 64, addrspace 1)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; GCN-LABEL: name: load_global_v16i32_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, basealign 64, addrspace 1)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.global.not.uniform.v16i32)
...
@@ -183,22 +184,22 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v8i64_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
+ ; GCN-LABEL: name: load_global_v8i64_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.global.not.uniform.v8i64)
...
@@ -210,11 +211,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v8i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s32>), addrspace 1)
+ ; GCN-LABEL: name: load_global_v8i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s32>), addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<8 x s32>) = G_LOAD %0 :: (invariant load (<8 x s32>), addrspace 1)
...
@@ -226,11 +227,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v4i64_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<4 x s64>), addrspace 1)
+ ; GCN-LABEL: name: load_global_v4i64_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<4 x s64>), addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<4 x s64>) = G_LOAD %0 :: (invariant load (<4 x s64>), addrspace 1)
...
@@ -242,11 +243,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v16i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<16 x s32>), addrspace 1)
+ ; GCN-LABEL: name: load_global_v16i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<16 x s32>), addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<16 x s32>) = G_LOAD %0 :: (invariant load (<16 x s32>), addrspace 1)
...
@@ -258,11 +259,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_global_v8i64_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s64>), addrspace 1)
+ ; GCN-LABEL: name: load_global_v8i64_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s64>), addrspace 1)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<8 x s64>) = G_LOAD %0 :: (invariant load (<8 x s64>), addrspace 1)
...
@@ -274,16 +275,16 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v8i32_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GCN-LABEL: name: load_constant_v8i32_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.constant.not.uniform.v8i32)
...
@@ -295,16 +296,16 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_i256_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
- ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128)
+ ; GCN-LABEL: name: load_constant_i256_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
+ ; GCN-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s256) = G_LOAD %0 :: (load (s256) from %ir.constant.not.uniform)
...
@@ -317,16 +318,16 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v16i16_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>)
+ ; GCN-LABEL: name: load_constant_v16i16_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>) from %ir.constant.not.uniform)
...
@@ -338,16 +339,16 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v4i64_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+ ; GCN-LABEL: name: load_constant_v4i64_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.constant.not.uniform.v4i64)
...
@@ -359,22 +360,22 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v16i32_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; GCN-LABEL: name: load_constant_v16i32_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.constant.not.uniform.v16i32)
...
@@ -386,22 +387,22 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v8i64_non_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
+ ; GCN-LABEL: name: load_constant_v8i64_non_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4)
+ ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4)
+ ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.constant.not.uniform.v8i64)
...
@@ -413,11 +414,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v8i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load (<8 x s32>), addrspace 4)
+ ; GCN-LABEL: name: load_constant_v8i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load (<8 x s32>), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), addrspace 4)
...
@@ -429,11 +430,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v16i16_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s16>) = G_LOAD [[COPY]](p4) :: (load (<16 x s16>), addrspace 4)
+ ; GCN-LABEL: name: load_constant_v16i16_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s16>) = G_LOAD [[COPY]](p4) :: (load (<16 x s16>), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>), addrspace 4)
...
@@ -445,11 +446,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v4i64_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load (<4 x s64>), addrspace 4)
+ ; GCN-LABEL: name: load_constant_v4i64_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load (<4 x s64>), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), addrspace 4)
...
@@ -461,11 +462,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v16i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load (<16 x s32>), addrspace 4)
+ ; GCN-LABEL: name: load_constant_v16i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load (<16 x s32>), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), addrspace 4)
...
@@ -477,11 +478,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v8i64_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p4) :: (load (<8 x s64>), addrspace 4)
+ ; GCN-LABEL: name: load_constant_v8i64_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p4) :: (load (<8 x s64>), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), addrspace 4)
...
@@ -493,12 +494,12 @@ body: |
bb.0:
liveins: $sgpr0
- ; CHECK-LABEL: name: load_local_uniform
- ; CHECK: liveins: $sgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 3)
+ ; GCN-LABEL: name: load_local_uniform
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 3)
%0:_(p3) = COPY $sgpr0
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 3)
@@ -510,12 +511,12 @@ body: |
bb.0:
liveins: $sgpr0
- ; CHECK-LABEL: name: load_region_uniform
- ; CHECK: liveins: $sgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 5)
+ ; GCN-LABEL: name: load_region_uniform
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 5)
%0:_(p3) = COPY $sgpr0
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 5)
@@ -528,12 +529,12 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: extload_constant_i8_to_i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ; GCN-LABEL: name: extload_constant_i8_to_i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 4, align 1)
...
@@ -546,12 +547,12 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: extload_global_i8_to_i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
+ ; GCN-LABEL: name: extload_global_i8_to_i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 1, align 1)
...
@@ -564,12 +565,12 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: extload_constant_i16_to_i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ; GCN-LABEL: name: extload_constant_i16_to_i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 4, align 2)
...
@@ -582,12 +583,12 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: extload_global_i16_to_i32_uniform
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
+ ; GCN-LABEL: name: extload_global_i16_to_i32_uniform
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 1, align 2)
...
@@ -599,11 +600,11 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_i32_uniform_align4
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), addrspace 4)
+ ; GCN-LABEL: name: load_constant_i32_uniform_align4
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 4)
...
@@ -616,12 +617,12 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_i32_uniform_align2
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4)
+ ; GCN-LABEL: name: load_constant_i32_uniform_align2
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 2)
...
@@ -634,12 +635,12 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_i32_uniform_align1
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4)
+ ; GCN-LABEL: name: load_constant_i32_uniform_align1
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 1)
...
@@ -652,12 +653,12 @@ body: |
bb.0:
liveins: $sgpr0
- ; CHECK-LABEL: name: load_private_uniform_sgpr_i32
- ; CHECK: liveins: $sgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p5) = COPY [[COPY]](p5)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p5) :: (load (s32), addrspace 5)
+ ; GCN-LABEL: name: load_private_uniform_sgpr_i32
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p5) = COPY [[COPY]](p5)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p5) :: (load (s32), addrspace 5)
%0:_(p5) = COPY $sgpr0
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 5, align 4)
...
@@ -671,15 +672,15 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1
- ; CHECK-LABEL: name: load_constant_v8i32_vgpr_crash
- ; CHECK: liveins: $vgpr0_vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>), align 32, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GCN-LABEL: name: load_constant_v8i32_vgpr_crash
+ ; GCN: liveins: $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>), align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
%0:_(p4) = COPY $vgpr0_vgpr1
%1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), addrspace 4)
...
@@ -690,26 +691,26 @@ legalized: true
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p4) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: G_BR %bb.1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[COPY]](p4), %bb.0, %3(p4), %bb.1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PHI]](p4) :: (load (<4 x s32>), align 32, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PHI]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4)
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p4) = COPY [[COPY1]](p4)
- ; CHECK-NEXT: G_BR %bb.1
+ ; GCN-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr(p4) = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: G_BR %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[COPY]](p4), %bb.0, %3(p4), %bb.1
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PHI]](p4) :: (load (<4 x s32>), align 32, addrspace 4)
+ ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PHI]], [[C]](s64)
+ ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4)
+ ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr(p4) = COPY [[COPY1]](p4)
+ ; GCN-NEXT: G_BR %bb.1
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
@@ -732,17 +733,24 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v3i32_align4
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), align 4, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX7-LABEL: name: load_constant_v3i32_align4
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), align 4, addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX12-LABEL: name: load_constant_v3i32_align4
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 4, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 4)
S_ENDPGM 0, implicit %1
@@ -755,17 +763,24 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v3i32_align8
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX7-LABEL: name: load_constant_v3i32_align8
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX12-LABEL: name: load_constant_v3i32_align8
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 8, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 8)
S_ENDPGM 0, implicit %1
@@ -778,14 +793,21 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v3i32_align16
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s32>), addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX7-LABEL: name: load_constant_v3i32_align16
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s32>), addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX12-LABEL: name: load_constant_v3i32_align16
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 16, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 16)
S_ENDPGM 0, implicit %1
@@ -798,18 +820,25 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v6i16_align4
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), align 4, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ; GFX7-LABEL: name: load_constant_v6i16_align4
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), align 4, addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+ ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ;
+ ; GFX12-LABEL: name: load_constant_v6i16_align4
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 4, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 4)
S_ENDPGM 0, implicit %1
@@ -822,18 +851,25 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v6i16_align8
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, align 8, addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ; GFX7-LABEL: name: load_constant_v6i16_align8
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+ ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ;
+ ; GFX12-LABEL: name: load_constant_v6i16_align8
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 8, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 8)
S_ENDPGM 0, implicit %1
@@ -846,14 +882,21 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_v6i16_align16
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>), addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16), [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16), [[UV6:%[0-9]+]]:sgpr(s16), [[UV7:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ; GFX7-LABEL: name: load_constant_v6i16_align16
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>), addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16), [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16), [[UV6:%[0-9]+]]:sgpr(s16), [[UV7:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ;
+ ; GFX12-LABEL: name: load_constant_v6i16_align16
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 16, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 16)
S_ENDPGM 0, implicit %1
@@ -866,17 +909,24 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_i96_align4
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), align 4, addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
- ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s96)
+ ; GFX7-LABEL: name: load_constant_i96_align4
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), align 4, addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
+ ; GFX7-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[MV]](s96)
+ ;
+ ; GFX12-LABEL: name: load_constant_i96_align4
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 4, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 4)
S_ENDPGM 0, implicit %1
@@ -889,17 +939,24 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_i96_align8
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), addrspace 4)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
- ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s96)
+ ; GFX7-LABEL: name: load_constant_i96_align8
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
+ ; GFX7-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[MV]](s96)
+ ;
+ ; GFX12-LABEL: name: load_constant_i96_align8
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 8, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 8)
S_ENDPGM 0, implicit %1
@@ -912,13 +969,20 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; CHECK-LABEL: name: load_constant_i96_align16
- ; CHECK: liveins: $sgpr0_sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s128) = G_LOAD [[COPY]](p4) :: (invariant load (s128), addrspace 4)
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[LOAD]](s128)
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s96)
+ ; GFX7-LABEL: name: load_constant_i96_align16
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s128) = G_LOAD [[COPY]](p4) :: (invariant load (s128), addrspace 4)
+ ; GFX7-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[LOAD]](s128)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s96)
+ ;
+ ; GFX12-LABEL: name: load_constant_i96_align16
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 16, addrspace 4)
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 16)
S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
index 0ac2dc42b969cc..949ed7946a6b12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -o - | FileCheck -check-prefix=SI %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -o - | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=amdgpu-regbankselect %s -o - | FileCheck -check-prefix=GFX12 %s
--- |
@@ -28,17 +29,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; SI-LABEL: name: split_smrd_load_range
- ; SI: liveins: $sgpr0_sgpr1
- ; SI-NEXT: {{ $}}
- ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; SI-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4)
- ; SI-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; SI-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, addrspace 4)
- ; SI-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
- ; SI-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX7-LABEL: name: split_smrd_load_range
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
+ ; GFX7-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX12-LABEL: name: split_smrd_load_range
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>), align 8, !range !0, addrspace 4)
+ ; GFX12-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 8, addrspace 4, !range !0)
$sgpr0_sgpr1_sgpr2 = COPY %1
@@ -52,17 +59,23 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; SI-LABEL: name: split_smrd_load_tbaa
- ; SI: liveins: $sgpr0_sgpr1
- ; SI-NEXT: {{ $}}
- ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; SI-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), !tbaa !2, addrspace 4)
- ; SI-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
- ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; SI-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, !tbaa !2, addrspace 4)
- ; SI-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
- ; SI-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX7-LABEL: name: split_smrd_load_tbaa
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), !tbaa !2, addrspace 4)
+ ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
+ ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, !tbaa !2, addrspace 4)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32)
+ ; GFX7-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX12-LABEL: name: split_smrd_load_tbaa
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>), align 8, !tbaa !2, addrspace 4)
+ ; GFX12-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[LOAD]](<3 x s32>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 8, addrspace 4, !tbaa !1)
$sgpr0_sgpr1_sgpr2 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index bfd5dcaa143c17..f5846c3d6db737 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -3,6 +3,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GFX6789,GFX678,GFX67,GFX7
; RUN: llc -march=amdgcn -mcpu=gfx801 < %s | FileCheck %s -check-prefixes=GFX6789,GFX678,GFX689,GFX89
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX6789,GFX689,GFX89,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12
define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i32 inreg %val) {
; GFX67-LABEL: test_sink_smem_offset_400:
@@ -28,6 +29,19 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
; GFX89-NEXT: s_cbranch_scc1 .LBB0_1
; GFX89-NEXT: ; %bb.2: ; %end
; GFX89-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_sink_smem_offset_400:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: .LBB0_1: ; %loop
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x190
+; GFX12-NEXT: s_add_co_i32 s2, s2, -1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %end
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i64 400
br label %loop
@@ -81,6 +95,19 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX89-NEXT: s_cbranch_scc1 .LBB1_1
; GFX89-NEXT: ; %bb.2: ; %end
; GFX89-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_sink_smem_offset_4000:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: .LBB1_1: ; %loop
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0xfa0
+; GFX12-NEXT: s_add_co_i32 s2, s2, -1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %end
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i64 4000
br label %loop
@@ -122,6 +149,19 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
; GFX7-NEXT: s_cbranch_scc1 .LBB2_1
; GFX7-NEXT: ; %bb.2: ; %end
; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_sink_smem_offset_4000000:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: .LBB2_1: ; %loop
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x3d0900
+; GFX12-NEXT: s_add_co_i32 s2, s2, -1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %end
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i64 4000000
br label %loop
@@ -137,20 +177,92 @@ end:
ret void
}
+define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %ptr, i32 inreg %val) {
+; GFX689-LABEL: test_sink_smem_offset_40000000:
+; GFX689: ; %bb.0: ; %entry
+; GFX689-NEXT: s_add_u32 s0, s0, 0x2625a00
+; GFX689-NEXT: s_addc_u32 s1, s1, 0
+; GFX689-NEXT: .LBB3_1: ; %loop
+; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX689-NEXT: s_waitcnt lgkmcnt(0)
+; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT: s_add_i32 s2, s2, -1
+; GFX689-NEXT: s_cmp_lg_u32 s2, 0
+; GFX689-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX689-NEXT: ; %bb.2: ; %end
+; GFX689-NEXT: s_endpgm
+;
+; GFX7-LABEL: test_sink_smem_offset_40000000:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: .LBB3_1: ; %loop
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s3, s[0:1], 0x989680
+; GFX7-NEXT: s_add_i32 s2, s2, -1
+; GFX7-NEXT: s_cmp_lg_u32 s2, 0
+; GFX7-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX7-NEXT: ; %bb.2: ; %end
+; GFX7-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_sink_smem_offset_40000000:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x2625a00
+; GFX12-NEXT: .LBB3_1: ; %loop
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT: s_add_co_i32 s2, s2, -1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %end
+; GFX12-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i64 40000000
+ br label %loop
+
+loop:
+ %count = phi i32 [ %dec, %loop ], [ %val, %entry ]
+ %dec = sub i32 %count, 1
+ %load = load volatile i32, ptr addrspace(4) %gep
+ %cond = icmp eq i32 %dec, 0
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg %ptr, i32 inreg %val) {
; GFX6789-LABEL: test_sink_smem_offset_40000000000:
; GFX6789: ; %bb.0: ; %entry
; GFX6789-NEXT: s_add_u32 s0, s0, 0x502f9000
; GFX6789-NEXT: s_addc_u32 s1, s1, 9
-; GFX6789-NEXT: .LBB3_1: ; %loop
+; GFX6789-NEXT: .LBB4_1: ; %loop
; GFX6789-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6789-NEXT: s_waitcnt lgkmcnt(0)
; GFX6789-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX6789-NEXT: s_add_i32 s2, s2, -1
; GFX6789-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6789-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX6789-NEXT: s_cbranch_scc1 .LBB4_1
; GFX6789-NEXT: ; %bb.2: ; %end
; GFX6789-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_sink_smem_offset_40000000000:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_mov_b32 s4, 0x502f9000
+; GFX12-NEXT: s_mov_b32 s5, 9
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: .LBB4_1: ; %loop
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT: s_add_co_i32 s2, s2, -1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %end
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i64 40000000000
br label %loop
@@ -171,27 +283,40 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX678: ; %bb.0: ; %entry
; GFX678-NEXT: s_add_u32 s0, s0, 0xfffffe70
; GFX678-NEXT: s_addc_u32 s1, s1, -1
-; GFX678-NEXT: .LBB4_1: ; %loop
+; GFX678-NEXT: .LBB5_1: ; %loop
; GFX678-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX678-NEXT: s_add_i32 s2, s2, -1
; GFX678-NEXT: s_cmp_lg_u32 s2, 0
-; GFX678-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX678-NEXT: s_cbranch_scc1 .LBB5_1
; GFX678-NEXT: ; %bb.2: ; %end
; GFX678-NEXT: s_endpgm
;
; GFX9-LABEL: test_sink_smem_offset_neg400:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: .LBB4_1: ; %loop
+; GFX9-NEXT: .LBB5_1: ; %loop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190
; GFX9-NEXT: s_add_i32 s2, s2, -1
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_sink_smem_offset_neg400:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: .LBB5_1: ; %loop
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190
+; GFX12-NEXT: s_add_co_i32 s2, s2, -1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %end
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i64 -400
br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 3c0b8f7712e198..b95231fd8880f5 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -4024,14 +4024,12 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x8
+; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_add_f32 s3, s4, s5
-; GFX12-NEXT: s_add_f32 s2, s4, s2
+; GFX12-NEXT: s_add_f32 s2, s4, s5
+; GFX12-NEXT: s_add_f32 s3, s4, s6
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX12-NEXT: s_max_num_f32 s2, s3, s2
+; GFX12-NEXT: s_max_num_f32 s2, s2, s3
; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 clamp
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] offset:12
; GFX12-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index 131a3951b2bf27..f865418befed7b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -41,9 +41,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr
define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) {
; GFX12-SDAG-LABEL: v_permlane16var_b32_vi:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
@@ -56,9 +54,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
;
; GFX12-GISEL-LABEL: v_permlane16var_b32_vi:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -76,9 +72,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) {
; GFX12-SDAG-LABEL: v_permlane16var_b32_vl:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
@@ -91,9 +85,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
;
; GFX12-GISEL-LABEL: v_permlane16var_b32_vl:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -111,9 +103,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -125,9 +115,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s
;
; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -270,9 +258,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s
define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) {
; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
@@ -285,9 +271,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
;
; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -305,9 +289,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) {
; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
@@ -320,9 +302,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
;
; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -340,9 +320,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -354,9 +332,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %
;
; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 4e65b376339490..818e8eb9463953 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -91,9 +91,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
;
; VARIANT4-LABEL: test_barrier:
; VARIANT4: ; %bb.0: ; %entry
-; VARIANT4-NEXT: s_clause 0x1
-; VARIANT4-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; VARIANT4-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT4-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2
@@ -115,9 +113,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
;
; VARIANT5-LABEL: test_barrier:
; VARIANT5: ; %bb.0: ; %entry
-; VARIANT5-NEXT: s_clause 0x1
-; VARIANT5-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; VARIANT5-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT5-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2
@@ -139,9 +135,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
;
; VARIANT6-LABEL: test_barrier:
; VARIANT6: ; %bb.0: ; %entry
-; VARIANT6-NEXT: s_clause 0x1
-; VARIANT6-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; VARIANT6-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0
; VARIANT6-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 1ad3e58ce7fc35..220002ce4f6c45 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -482,9 +482,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
; GCN-LABEL: test1_s_barrier_init:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -501,9 +499,7 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_init:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -530,9 +526,7 @@ entry:
define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
; GCN-LABEL: test2_s_barrier_init:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -549,9 +543,7 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_init:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -578,9 +570,7 @@ entry:
define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
; GCN-LABEL: test3_s_barrier_init:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -597,9 +587,7 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_init:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -820,37 +808,33 @@ entry:
define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 {
; GCN-LABEL: test4_s_barrier_join_m0:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_b32 v3, v1, s[2:3]
-; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
; GCN-NEXT: s_barrier_join m0
-; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
-; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
; GLOBAL-ISEL-NEXT: s_barrier_join m0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1062,37 +1046,33 @@ entry:
define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 {
; GCN-LABEL: test4_s_wakeup_barrier_m0:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_b32 v3, v1, s[2:3]
-; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
; GCN-NEXT: s_wakeup_barrier m0
-; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
-; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1238,36 +1218,32 @@ entry:
define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 {
; GCN-LABEL: test4_s_get_barrier_state_m0:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
-; GCN-NEXT: s_mov_b32 m0, s0
-; GCN-NEXT: s_get_barrier_state s0, m0
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: s_get_barrier_state s2, m0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
-; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, m0
; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 970c2c1c0456e0..ace70aedc33d93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -5,6 +5,7 @@
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX789,GFX8910,GFX89,GFX910,GFX9
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX78910,GFX8910,GFX910,GFX10
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12
define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
; GFX67-LABEL: s_buffer_load_imm:
@@ -30,6 +31,14 @@ define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_load_imm:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x4
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
%bitcast = bitcast i32 %load to float
@@ -61,6 +70,14 @@ define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %ind
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_load_index:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast i32 %load to float
@@ -82,6 +99,13 @@ define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_load_index_divergent:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast i32 %load to float
@@ -116,6 +140,15 @@ define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx2_imm:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x40
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
%bitcast = bitcast <2 x i32> %load to <2 x float>
@@ -152,6 +185,15 @@ define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %i
; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx2_index:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <2 x i32> %load to <2 x float>
@@ -175,6 +217,13 @@ define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx2_index_divergent:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <2 x i32> %load to <2 x float>
@@ -214,6 +263,16 @@ define amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) {
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx3_imm:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b96 s[0:2], s[0:3], 0x40
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: export mrt0 v0, v1, v2, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 64, i32 0)
%bitcast = bitcast <3 x i32> %load to <3 x float>
@@ -254,6 +313,16 @@ define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %i
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx3_index:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b96 s[0:2], s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: export mrt0 v0, v1, v2, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <3 x i32> %load to <3 x float>
@@ -285,6 +354,13 @@ define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx3_index_divergent:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_load_b96 v[0:2], v0, s[0:3], null offen
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: export mrt0 v0, v1, v2, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <3 x i32> %load to <3 x float>
@@ -328,6 +404,17 @@ define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx4_imm:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0xc8
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
%bitcast = bitcast <4 x i32> %load to <4 x float>
@@ -372,6 +459,17 @@ define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %i
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx4_index:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <4 x i32> %load to <4 x float>
@@ -397,6 +495,13 @@ define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_loadx4_index_divergent:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done
+; GFX12-NEXT: s_endpgm
main_body:
%load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
%bitcast = bitcast <4 x i32> %load to <4 x float>
@@ -435,6 +540,15 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_load_imm_mergex2:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x4
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
%load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
@@ -477,6 +591,17 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_load_imm_mergex4:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0x8
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done
+; GFX12-NEXT: s_endpgm
main_body:
%load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
%load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0)
@@ -594,6 +719,24 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_load_index_across_bb:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_getpc_b64 s[4:5]
+; GFX12-NEXT: s_add_co_u32 s4, s4, gv@gotpcrel32@lo+4
+; GFX12-NEXT: s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+12
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: v_or_b32_e32 v0, 8, v0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
main_body:
%tmp = shl i32 %index, 4
store i32 %tmp, ptr addrspace(1) @gv
@@ -623,6 +766,14 @@ define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %des
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_buffer_load_index_across_bb_merged:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0/*Invalid immediate*/ offen offset:8
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done
+; GFX12-NEXT: s_endpgm
main_body:
%tmp = shl i32 %index, 4
br label %bb1
@@ -667,6 +818,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg1(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_neg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0)
ret i32 %load
}
@@ -706,6 +864,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg4(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_neg4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 s4, -4
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0)
ret i32 %load
}
@@ -745,6 +910,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg8(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_neg8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 s4, -8
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0)
ret i32 %load
}
@@ -784,6 +956,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit31(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_bit31:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_brev_b32 s4, 1
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0)
ret i32 %load
}
@@ -823,6 +1002,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit30(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_bit30:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 s4, 2.0
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 0)
ret i32 %load
}
@@ -862,6 +1048,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit29(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_bit29:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_brev_b32 s4, 4
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0)
ret i32 %load
}
@@ -901,6 +1094,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit21(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_bit21:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x200000
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0)
ret i32 %load
}
@@ -940,6 +1139,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit20(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_bit20:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100000
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0)
ret i32 %load
}
@@ -979,6 +1184,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit20(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_neg_bit20:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 s4, 0xfff00000
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0)
ret i32 %load
}
@@ -1009,6 +1221,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit19(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x80000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_bit19:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x80000
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0)
ret i32 %load
}
@@ -1048,6 +1266,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit19(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_neg_bit19:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 s4, 0xfff80000
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0)
ret i32 %load
}
@@ -1079,6 +1304,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_255(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0xff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_255:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0xff
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 255, i32 0)
ret i32 %load
}
@@ -1101,6 +1332,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_256(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_256:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 256, i32 0)
ret i32 %load
}
@@ -1123,6 +1360,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1016(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3f8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_1016:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3f8
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1016, i32 0)
ret i32 %load
}
@@ -1145,6 +1388,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1020(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_1020:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fc
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1020, i32 0)
ret i32 %load
}
@@ -1176,6 +1425,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1021(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fd
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_1021:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fd
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1021, i32 0)
ret i32 %load
}
@@ -1206,6 +1461,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1024(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_1024:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0)
ret i32 %load
}
@@ -1237,6 +1498,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1025(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x401
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_1025:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x401
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1025, i32 0)
ret i32 %load
}
@@ -1267,6 +1534,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1028(<4 x i32> inreg %desc) {
; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: s_buffer_load_imm_1028:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0)
ret i32 %load
}
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
index b5f38c641da74f..d299e760b87740 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; Tests whether a load chain of 8 constants gets vectorized into a wider load.
define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) {
@@ -53,6 +54,31 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur
; EG-NEXT: ADD T0.X, T1.W, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v8f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x0
+; GFX12-NEXT: s_load_b256 s[0:7], s[8:9], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_add_f32 s0, s0, s12
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_add_f32 s0, s1, s0
+; GFX12-NEXT: s_add_f32 s0, s2, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_add_f32 s0, s3, s0
+; GFX12-NEXT: s_add_f32 s0, s4, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_add_f32 s0, s5, s0
+; GFX12-NEXT: s_add_f32 s0, s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_add_f32 s0, s7, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[10:11]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%out_ptr.promoted = load float, ptr addrspace(1) %out_ptr, align 4
%tmp = load float, ptr addrspace(4) %weights, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 09ef53f3303080..d00044c6ac1ab8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6-NOHSA %s
; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; FUNC-LABEL: {{^}}constant_load_f64:
define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
@@ -43,6 +44,19 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load double, ptr addrspace(4) %in
store double %ld, ptr addrspace(1) %out
ret void
@@ -119,6 +133,31 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[14:15], v[0:1]
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NOHSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_2v4f64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0
+; GFX12-NEXT: s_load_b512 s[0:15], s[16:17], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[20:21]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[0:1], s[2:3], v[0:1]
+; GFX12-NEXT: v_add_f64_e32 v[0:1], s[4:5], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[0:1], s[6:7], v[0:1]
+; GFX12-NEXT: v_add_f64_e32 v[0:1], s[8:9], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[0:1], s[10:11], v[0:1]
+; GFX12-NEXT: v_add_f64_e32 v[0:1], s[12:13], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[0:1], s[14:15], v[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[18:19]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%out_ptr.promoted = load double, ptr addrspace(1) %out_ptr, align 4
%tmp = load double, ptr addrspace(4) %weights, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index d100cadb8ee579..4ed4034a0348f4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
; GFX6-LABEL: constant_load_i1:
@@ -61,6 +62,19 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load i1, ptr addrspace(4) %in
store i1 %load, ptr addrspace(1) %out
ret void
@@ -122,6 +136,18 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v2i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
store <2 x i1> %load, ptr addrspace(1) %out
ret void
@@ -182,6 +208,18 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v3i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
store <3 x i1> %load, ptr addrspace(1) %out
ret void
@@ -243,6 +281,18 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v4i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
store <4 x i1> %load, ptr addrspace(1) %out
ret void
@@ -304,6 +354,18 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v8i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
store <8 x i1> %load, ptr addrspace(1) %out
ret void
@@ -365,6 +427,18 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v16i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
store <16 x i1> %load, ptr addrspace(1) %out
ret void
@@ -410,6 +484,18 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v32i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
store <32 x i1> %load, ptr addrspace(1) %out
ret void
@@ -457,6 +543,19 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v64i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
store <64 x i1> %load, ptr addrspace(1) %out
ret void
@@ -508,6 +607,18 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_i1_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = zext i1 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -563,6 +674,19 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_i1_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = sext i1 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -615,6 +739,18 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v1i1_to_v1i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = zext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -670,6 +806,19 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v1i1_to_v1i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = sext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -729,6 +878,23 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
; EG-NEXT: AND_INT T0.X, T0.X, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v2i1_to_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = zext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -789,6 +955,22 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_INT * T1.Y, PV.W, 0.0, 1,
+;
+; GFX12-LABEL: constant_sextload_v2i1_to_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = sext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -858,6 +1040,27 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v3i1_to_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = zext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, ptr addrspace(1) %out
@@ -928,6 +1131,24 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: BFE_INT * T3.Y, PV.W, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v3i1_to_v3i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v1, v4, 0, 1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = sext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, ptr addrspace(1) %out
@@ -999,6 +1220,31 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: AND_INT T0.X, T0.X, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v4i1_to_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v3, 3, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v5, 1, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v5
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = zext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1071,6 +1317,27 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T1.Y, PV.W, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v4i1_to_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_bfe_i32 v3, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_bfe_i32 v1, v5, 0, 1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = sext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1170,6 +1437,38 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; EG-NEXT: 4(5.605194e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v8i1_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v8, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0
+; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v6, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0
+; GFX12-NEXT: v_lshrrev_b16 v3, 6, v0
+; GFX12-NEXT: v_and_b32_e32 v9, 1, v2
+; GFX12-NEXT: v_lshrrev_b16 v4, 7, v0
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, v0
+; GFX12-NEXT: v_and_b32_e32 v10, 1, v5
+; GFX12-NEXT: v_and_b32_e32 v5, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v6, 1, v3
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v7
+; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v9
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = zext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1273,6 +1572,35 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
; EG-NEXT: LSHR T5.X, PS, literal.x,
; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v8i1_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v8, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
+; GFX12-NEXT: v_lshrrev_b16 v5, 6, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v6, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v7, 7, v0
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: v_bfe_i32 v3, v2, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v6, 0, 1
+; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 1
+; GFX12-NEXT: v_bfe_i32 v6, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v1, v9, 0, 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = sext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1433,6 +1761,60 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; EG-NEXT: 12(1.681558e-44), 48(6.726233e-44)
; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v16i1_to_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v16, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0
+; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v15, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 9, v0
+; GFX12-NEXT: v_lshrrev_b16 v6, 11, v0
+; GFX12-NEXT: v_and_b32_e32 v17, 1, v2
+; GFX12-NEXT: v_lshrrev_b16 v10, 5, v0
+; GFX12-NEXT: v_lshrrev_b16 v12, 7, v0
+; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0
+; GFX12-NEXT: v_lshrrev_b16 v3, 14, v0
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, v0
+; GFX12-NEXT: v_lshrrev_b16 v14, 2, v0
+; GFX12-NEXT: v_and_b32_e32 v22, 1, v13
+; GFX12-NEXT: v_and_b32_e32 v13, 1, v15
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, v0
+; GFX12-NEXT: v_lshrrev_b16 v8, 10, v0
+; GFX12-NEXT: v_lshrrev_b16 v9, 4, v0
+; GFX12-NEXT: v_lshrrev_b16 v11, 6, v0
+; GFX12-NEXT: v_and_b32_e32 v18, 1, v4
+; GFX12-NEXT: v_and_b32_e32 v19, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v20, 1, v10
+; GFX12-NEXT: v_and_b32_e32 v21, 1, v12
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v14
+; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v5
+; GFX12-NEXT: v_and_b32_e32 v14, 1, v3
+; GFX12-NEXT: v_and_b32_e32 v12, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13
+; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v17
+; GFX12-NEXT: v_and_b32_e32 v6, 1, v11
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v9
+; GFX12-NEXT: v_and_b32_e32 v10, 1, v8
+; GFX12-NEXT: v_and_b32_e32 v8, 1, v7
+; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v19
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v18
+; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v21
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v20
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v22
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = zext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -1602,6 +1984,53 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; EG-NEXT: LSHR T7.X, PS, literal.x,
; EG-NEXT: BFE_INT * T13.Y, PV.W, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v16i1_to_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v16, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0
+; GFX12-NEXT: v_lshrrev_b16 v8, 14, v0
+; GFX12-NEXT: v_lshrrev_b16 v12, 15, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v13, 7, v0
+; GFX12-NEXT: v_lshrrev_b16 v17, 8, v0
+; GFX12-NEXT: v_lshrrev_b16 v9, 9, v0
+; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0
+; GFX12-NEXT: v_lshrrev_b16 v11, 11, v0
+; GFX12-NEXT: v_lshrrev_b16 v18, 4, v0
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, v0
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, v0
+; GFX12-NEXT: v_lshrrev_b16 v19, 1, v0
+; GFX12-NEXT: v_bfe_i32 v3, v2, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v7, 0, 1
+; GFX12-NEXT: v_bfe_i32 v7, v13, 0, 1
+; GFX12-NEXT: v_bfe_i32 v15, v12, 0, 1
+; GFX12-NEXT: v_bfe_i32 v14, v8, 0, 1
+; GFX12-NEXT: v_bfe_i32 v13, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v12, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1
+; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1
+; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1
+; GFX12-NEXT: v_bfe_i32 v8, v17, 0, 1
+; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1
+; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v18, 0, 1
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: v_bfe_i32 v1, v19, 0, 1
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = sext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -1914,6 +2343,92 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; EG-NEXT: 28(3.923636e-44), 112(1.569454e-43)
; EG-NEXT: LSHR * T26.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v32i1_to_v32i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v10, 1, s2
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT: s_and_b32 s5, s2, 1
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v16, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_and_b32_e32 v25, 1, v14
+; GFX12-NEXT: v_and_b32_e32 v26, 1, v18
+; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
+; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011
+; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010
+; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017
+; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
+; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016
+; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10014
+; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
+; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
+; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
+; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
+; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
+; GFX12-NEXT: v_and_b32_e32 v10, 1, v3
+; GFX12-NEXT: v_and_b32_e32 v14, 1, v19
+; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v17
+; GFX12-NEXT: v_and_b32_e32 v18, 1, v16
+; GFX12-NEXT: v_and_b32_e32 v16, 1, v15
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v24
+; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_and_b32 v13, 0xffff, v26
+; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v23
+; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20
+; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v9
+; GFX12-NEXT: v_and_b32_e32 v20, 1, v0
+; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25
+; GFX12-NEXT: v_mov_b32_e32 v25, s2
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v22
+; GFX12-NEXT: v_and_b32_e32 v22, 1, v12
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21
+; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v33
+; GFX12-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = zext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -2249,6 +2764,89 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; EG-NEXT: LSHR T11.X, PS, literal.x,
; EG-NEXT: BFE_INT * T25.Y, PV.W, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v32i1_to_v32i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v14, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v11, 11, s2
+; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 3, s2
+; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
+; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013
+; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v22, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
+; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10011
+; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10010
+; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10017
+; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016
+; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
+; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1
+; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
+; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s3
+; GFX12-NEXT: v_bfe_i32 v14, v13, 0, 1
+; GFX12-NEXT: v_bfe_i32 v13, v12, 0, 1
+; GFX12-NEXT: v_bfe_i32 v12, v0, 0, 1
+; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1
+; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1
+; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1
+; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1
+; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v31, s6
+; GFX12-NEXT: v_mov_b32_e32 v30, s7
+; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 1
+; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1
+; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_bfe_i32 v23, v22, 0, 1
+; GFX12-NEXT: v_bfe_i32 v22, v21, 0, 1
+; GFX12-NEXT: v_bfe_i32 v21, v20, 0, 1
+; GFX12-NEXT: v_bfe_i32 v20, v16, 0, 1
+; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1
+; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
+; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v16, s4
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = sext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -2840,6 +3438,173 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; EG-NEXT: 28(3.923636e-44), 240(3.363116e-43)
; EG-NEXT: LSHR * T50.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v64i1_to_v64i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2
+; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s3
+; GFX12-NEXT: v_lshrrev_b16 v3, 9, s2
+; GFX12-NEXT: v_and_b32_e32 v45, 1, v2
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s4
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v8, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 11, s3
+; GFX12-NEXT: v_lshrrev_b16 v14, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 5, s4
+; GFX12-NEXT: s_lshr_b32 s5, s2, 24
+; GFX12-NEXT: s_and_b32 s6, s3, 1
+; GFX12-NEXT: s_bfe_u32 s14, s3, 0x10012
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v55, s14 :: v_dual_and_b32 v36, 1, v10
+; GFX12-NEXT: v_and_b32_e32 v10, 1, v2
+; GFX12-NEXT: v_lshrrev_b16 v2, 3, s4
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v11, 9, s3
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v16, 3, s3
+; GFX12-NEXT: v_and_b32_e32 v43, 1, v4
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
+; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10014
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v49, s19 :: v_dual_and_b32 v42, 1, v3
+; GFX12-NEXT: v_lshrrev_b16 v3, 5, s5
+; GFX12-NEXT: s_bfe_u32 s13, s3, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v29, 12, s3
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
+; GFX12-NEXT: v_lshrrev_b16 v25, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
+; GFX12-NEXT: v_dual_mov_b32 v56, s13 :: v_dual_and_b32 v27, 1, v12
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_and_b32_e32 v12, 1, v2
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s5
+; GFX12-NEXT: s_and_b32 s7, s2, 1
+; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10011
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v54, s15 :: v_dual_and_b32 v35, 1, v8
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s5
+; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10010
+; GFX12-NEXT: v_dual_mov_b32 v53, s16 :: v_dual_and_b32 v40, 1, v7
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, s5
+; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10017
+; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10016
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v51, s18 :: v_dual_and_b32 v44, 1, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s5
+; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10015
+; GFX12-NEXT: v_and_b32_e32 v23, 1, v14
+; GFX12-NEXT: v_and_b32_e32 v14, 1, v18
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s5
+; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
+; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
+; GFX12-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_and_b32 v39, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v32, 1, v11
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s4
+; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
+; GFX12-NEXT: v_and_b32_e32 v20, 1, v16
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s4
+; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
+; GFX12-NEXT: v_and_b32_e32 v24, 1, v15
+; GFX12-NEXT: v_lshrrev_b16 v15, 6, s4
+; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
+; GFX12-NEXT: v_mov_b32_e32 v50, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_and_b32_e32 v28, 1, v13
+; GFX12-NEXT: v_lshrrev_b16 v13, 4, s4
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT: v_and_b32_e32 v6, 1, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 1, v4
+; GFX12-NEXT: v_lshrrev_b16 v17, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v41, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v37, 4, s2
+; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
+; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10014
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX12-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:192
+; GFX12-NEXT: v_mov_b32_e32 v52, s12
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX12-NEXT: v_dual_mov_b32 v54, s10 :: v_dual_and_b32 v3, 1, v7
+; GFX12-NEXT: v_dual_mov_b32 v56, s8 :: v_dual_and_b32 v7, 1, v18
+; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s13
+; GFX12-NEXT: v_mov_b32_e32 v51, s3
+; GFX12-NEXT: v_dual_mov_b32 v53, s11 :: v_dual_and_b32 v18, 0xffff, v24
+; GFX12-NEXT: v_and_b32_e32 v24, 0xffff, v23
+; GFX12-NEXT: v_and_b32_e32 v23, 1, v22
+; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v28
+; GFX12-NEXT: v_and_b32_e32 v28, 0xffff, v27
+; GFX12-NEXT: v_and_b32_e32 v27, 1, v26
+; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v32
+; GFX12-NEXT: v_and_b32_e32 v32, 0xffff, v31
+; GFX12-NEXT: v_and_b32_e32 v31, 1, v30
+; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v36
+; GFX12-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX12-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX12-NEXT: v_dual_mov_b32 v55, s9 :: v_dual_and_b32 v48, 0xffff, v17
+; GFX12-NEXT: v_dual_mov_b32 v17, s6 :: v_dual_and_b32 v20, 0xffff, v20
+; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX12-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v35
+; GFX12-NEXT: v_and_b32_e32 v35, 1, v34
+; GFX12-NEXT: v_and_b32_e32 v34, 0xffff, v40
+; GFX12-NEXT: v_and_b32_e32 v40, 0xffff, v39
+; GFX12-NEXT: v_and_b32_e32 v39, 1, v38
+; GFX12-NEXT: v_and_b32_e32 v38, 0xffff, v44
+; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v43
+; GFX12-NEXT: v_and_b32_e32 v43, 1, v41
+; GFX12-NEXT: v_and_b32_e32 v47, 1, v9
+; GFX12-NEXT: v_and_b32_e32 v46, 0xffff, v45
+; GFX12-NEXT: v_and_b32_e32 v45, 1, v1
+; GFX12-NEXT: v_and_b32_e32 v41, 1, v33
+; GFX12-NEXT: v_dual_mov_b32 v33, s7 :: v_dual_and_b32 v14, 0xffff, v14
+; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_and_b32 v42, 0xffff, v42
+; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX12-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX12-NEXT: v_and_b32_e32 v37, 1, v37
+; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v6, 0xffff, v6
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: s_clause 0xd
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1]
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:96
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = zext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -3473,6 +4238,160 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; EG-NEXT: LSHR T50.X, PS, literal.x,
; EG-NEXT: BFE_INT * T19.Y, PV.W, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v64i1_to_v64i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s5, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3
+; GFX12-NEXT: v_lshrrev_b16 v29, 13, s3
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
+; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v25, 9, s3
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
+; GFX12-NEXT: v_lshrrev_b16 v27, 11, s3
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
+; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s5
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s5
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, s5
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s5
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s5
+; GFX12-NEXT: v_lshrrev_b16 v44, 7, s5
+; GFX12-NEXT: s_bfe_i32 s5, s3, 0x10018
+; GFX12-NEXT: s_bfe_i32 s6, s3, 0x10000
+; GFX12-NEXT: s_bfe_i32 s13, s3, 0x10013
+; GFX12-NEXT: s_bfe_i32 s14, s3, 0x10012
+; GFX12-NEXT: s_bfe_i32 s15, s3, 0x10011
+; GFX12-NEXT: s_bfe_i32 s16, s3, 0x10010
+; GFX12-NEXT: s_bfe_i32 s17, s3, 0x10017
+; GFX12-NEXT: s_bfe_i32 s18, s3, 0x10016
+; GFX12-NEXT: s_bfe_i32 s19, s3, 0x10014
+; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s3
+; GFX12-NEXT: v_dual_mov_b32 v48, s19 :: v_dual_mov_b32 v51, s17
+; GFX12-NEXT: v_dual_mov_b32 v50, s18 :: v_dual_mov_b32 v53, s15
+; GFX12-NEXT: v_lshrrev_b16 v16, 14, s2
+; GFX12-NEXT: v_dual_mov_b32 v52, s16 :: v_dual_mov_b32 v55, s13
+; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10015
+; GFX12-NEXT: v_mov_b32_e32 v54, s14
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4
+; GFX12-NEXT: v_lshrrev_b16 v15, 7, s4
+; GFX12-NEXT: v_lshrrev_b16 v40, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v41, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v42, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v43, 11, s2
+; GFX12-NEXT: v_lshrrev_b16 v36, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v37, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v39, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v33, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v35, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, s4
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s4
+; GFX12-NEXT: v_lshrrev_b16 v11, 3, s4
+; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
+; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10013
+; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10012
+; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10011
+; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10010
+; GFX12-NEXT: s_bfe_i32 s12, s2, 0x10017
+; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10016
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10014
+; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1
+; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1
+; GFX12-NEXT: v_bfe_i32 v21, v21, 0, 1
+; GFX12-NEXT: v_bfe_i32 v20, v20, 0, 1
+; GFX12-NEXT: v_bfe_i32 v31, v31, 0, 1
+; GFX12-NEXT: v_bfe_i32 v30, v30, 0, 1
+; GFX12-NEXT: v_bfe_i32 v29, v29, 0, 1
+; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s2
+; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s3
+; GFX12-NEXT: v_mov_b32_e32 v53, s10
+; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1
+; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
+; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1
+; GFX12-NEXT: v_bfe_i32 v27, v27, 0, 1
+; GFX12-NEXT: v_bfe_i32 v26, v26, 0, 1
+; GFX12-NEXT: v_bfe_i32 v25, v25, 0, 1
+; GFX12-NEXT: v_bfe_i32 v24, v24, 0, 1
+; GFX12-NEXT: v_bfe_i32 v46, v16, 0, 1
+; GFX12-NEXT: v_dual_mov_b32 v52, s11 :: v_dual_mov_b32 v55, s8
+; GFX12-NEXT: v_mov_b32_e32 v54, s9
+; GFX12-NEXT: v_mov_b32_e32 v16, s6
+; GFX12-NEXT: v_bfe_i32 v3, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v1, v7, 0, 1
+; GFX12-NEXT: v_bfe_i32 v7, v44, 0, 1
+; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1
+; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1
+; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1
+; GFX12-NEXT: v_bfe_i32 v12, v12, 0, 1
+; GFX12-NEXT: v_bfe_i32 v47, v32, 0, 1
+; GFX12-NEXT: v_bfe_i32 v45, v8, 0, 1
+; GFX12-NEXT: v_bfe_i32 v44, v0, 0, 1
+; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1
+; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1
+; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1
+; GFX12-NEXT: v_bfe_i32 v43, v43, 0, 1
+; GFX12-NEXT: v_bfe_i32 v42, v42, 0, 1
+; GFX12-NEXT: v_bfe_i32 v41, v41, 0, 1
+; GFX12-NEXT: v_bfe_i32 v40, v40, 0, 1
+; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1
+; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v39, v39, 0, 1
+; GFX12-NEXT: v_bfe_i32 v38, v38, 0, 1
+; GFX12-NEXT: v_bfe_i32 v37, v37, 0, 1
+; GFX12-NEXT: v_bfe_i32 v36, v36, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX12-NEXT: v_bfe_i32 v35, v35, 0, 1
+; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1
+; GFX12-NEXT: v_bfe_i32 v33, v33, 0, 1
+; GFX12-NEXT: v_mov_b32_e32 v32, s7
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v56, v[44:47], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v56, v[40:43], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v56, v[32:35], s[0:1]
+; GFX12-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:144
+; GFX12-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:240
+; GFX12-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = sext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -3530,6 +4449,19 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_i1_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = zext i1 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -3588,6 +4520,21 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T0.Y, PV.X,
+;
+; GFX12-LABEL: constant_sextload_i1_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = sext i1 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -3645,6 +4592,19 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v1i1_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = zext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -3703,6 +4663,21 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T0.Y, PV.X,
+;
+; GFX12-LABEL: constant_sextload_v1i1_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = sext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -3768,6 +4743,23 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
; EG-NEXT: MOV T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v2i1_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = zext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -3835,6 +4827,25 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.W, T1.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v2i1_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = sext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -3918,6 +4929,29 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v3i1_to_v3i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v5, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v3, 2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX12-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v6, 1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v3, v5 :: v_dual_and_b32 v4, 0xffff, v3
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v6
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = zext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4005,6 +5039,31 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T1.W, T1.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v3i1_to_v3i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v6, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v6, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = sext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4097,6 +5156,34 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v4i1_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v6, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0
+; GFX12-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-NEXT: v_lshrrev_b16 v0, 3, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v2
+; GFX12-NEXT: v_and_b32_e32 v9, 1, v4
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v4, 0xffff, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v9
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = zext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -4193,6 +5280,35 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; EG-NEXT: MOV T1.W, T1.Z,
; EG-NEXT: MOV * T2.W, T2.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v4i1_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v8, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_bfe_i32 v6, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v2, 0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_bfe_i32 v2, v3, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = sext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -4335,6 +5451,41 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR * T12.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v8i1_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v12, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
+; GFX12-NEXT: v_lshrrev_b16 v8, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v14, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 7, v0
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, v0
+; GFX12-NEXT: v_lshrrev_b16 v10, 4, v0
+; GFX12-NEXT: v_and_b32_e32 v17, 1, v4
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v18, 1, v8
+; GFX12-NEXT: v_lshrrev_b16 v16, 2, v0
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v14, 1, v14
+; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v0, 1, v6
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v16
+; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v4, 1, v10
+; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v6, 0xffff, v17
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v18
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = zext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -4489,6 +5640,45 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; EG-NEXT: MOV T5.W, T5.Z,
; EG-NEXT: MOV * T8.W, T8.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v8i1_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v16, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v16, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v3, 6, v1
+; GFX12-NEXT: v_lshrrev_b16 v5, 7, v1
+; GFX12-NEXT: v_lshrrev_b16 v7, 4, v1
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, v1
+; GFX12-NEXT: v_lshrrev_b16 v8, 2, v1
+; GFX12-NEXT: v_lshrrev_b16 v9, 5, v1
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1
+; GFX12-NEXT: v_bfe_i32 v14, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v12, v3, 0, 1
+; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v8, 0, 1
+; GFX12-NEXT: v_bfe_i32 v10, v9, 0, 1
+; GFX12-NEXT: v_bfe_i32 v8, v7, 0, 1
+; GFX12-NEXT: v_bfe_i32 v0, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = sext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -4738,6 +5928,67 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR * T22.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v16i1_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v28, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0
+; GFX12-NEXT: v_lshrrev_b16 v8, 9, v0
+; GFX12-NEXT: v_lshrrev_b16 v12, 13, v0
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, v0
+; GFX12-NEXT: v_lshrrev_b16 v2, 15, v0
+; GFX12-NEXT: v_lshrrev_b16 v6, 14, v0
+; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0
+; GFX12-NEXT: v_lshrrev_b16 v20, 5, v0
+; GFX12-NEXT: v_lshrrev_b16 v24, 3, v0
+; GFX12-NEXT: v_lshrrev_b16 v32, 1, v0
+; GFX12-NEXT: v_and_b32_e32 v33, 1, v4
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, v0
+; GFX12-NEXT: v_lshrrev_b16 v18, 12, v0
+; GFX12-NEXT: v_and_b32_e32 v35, 1, v12
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, v0
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24
+; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32
+; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10
+; GFX12-NEXT: v_mov_b32_e32 v23, v1
+; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-NEXT: v_mov_b32_e32 v31, v1
+; GFX12-NEXT: v_lshrrev_b16 v26, 4, v0
+; GFX12-NEXT: v_lshrrev_b16 v30, 2, v0
+; GFX12-NEXT: v_and_b32_e32 v37, 1, v20
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30
+; GFX12-NEXT: v_and_b32_e32 v8, 1, v14
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34
+; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v20, 1, v26
+; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v16, 1, v22
+; GFX12-NEXT: v_and_b32_e32 v12, 1, v18
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v35
+; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36
+; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32
+; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38
+; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = zext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -5010,6 +6261,73 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; EG-NEXT: MOV T20.W, T20.Z,
; EG-NEXT: MOV * T14.W, T14.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v16i1_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v32, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v32, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v3, 14, v1
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, v1
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, v1
+; GFX12-NEXT: v_lshrrev_b16 v9, 13, v1
+; GFX12-NEXT: v_lshrrev_b16 v11, 10, v1
+; GFX12-NEXT: v_lshrrev_b16 v13, 11, v1
+; GFX12-NEXT: v_lshrrev_b16 v15, 8, v1
+; GFX12-NEXT: v_lshrrev_b16 v16, 9, v1
+; GFX12-NEXT: v_lshrrev_b16 v12, 6, v1
+; GFX12-NEXT: v_lshrrev_b16 v14, 7, v1
+; GFX12-NEXT: v_lshrrev_b16 v8, 4, v1
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, v1
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, v1
+; GFX12-NEXT: v_lshrrev_b16 v17, 5, v1
+; GFX12-NEXT: v_bfe_i32 v30, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v28, v3, 0, 1
+; GFX12-NEXT: v_bfe_i32 v26, v9, 0, 1
+; GFX12-NEXT: v_bfe_i32 v24, v7, 0, 1
+; GFX12-NEXT: v_bfe_i32 v22, v13, 0, 1
+; GFX12-NEXT: v_bfe_i32 v20, v11, 0, 1
+; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1
+; GFX12-NEXT: v_bfe_i32 v16, v15, 0, 1
+; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1
+; GFX12-NEXT: v_bfe_i32 v12, v12, 0, 1
+; GFX12-NEXT: v_bfe_i32 v0, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v10, 0, 1
+; GFX12-NEXT: v_bfe_i32 v10, v17, 0, 1
+; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v31, 31, v30
+; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GFX12-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX12-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = sext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -5459,6 +6777,119 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: LSHR * T42.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v32i1_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
+; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2
+; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v22, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v11, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
+; GFX12-NEXT: v_and_b32_e32 v24, 1, v4
+; GFX12-NEXT: v_and_b32_e32 v25, 1, v8
+; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
+; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
+; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s2
+; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2
+; GFX12-NEXT: v_and_b32_e32 v26, 1, v15
+; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9
+; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
+; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v14, 2, s2
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT: s_and_b32 s5, s2, 1
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
+; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
+; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v14
+; GFX12-NEXT: v_and_b32_e32 v8, 1, v12
+; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v29
+; GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v35, 1, v18
+; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v16
+; GFX12-NEXT: v_and_b32_e32 v39, 1, v7
+; GFX12-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: v_mov_b32_e32 v5, v1
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9
+; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22
+; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20
+; GFX12-NEXT: v_mov_b32_e32 v20, v1
+; GFX12-NEXT: v_mov_b32_e32 v22, v1
+; GFX12-NEXT: v_mov_b32_e32 v18, v1
+; GFX12-NEXT: v_and_b32_e32 v12, 1, v10
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v25
+; GFX12-NEXT: v_mov_b32_e32 v24, v1
+; GFX12-NEXT: s_clause 0x4
+; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[19:22], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[15:18], s[0:1] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v15, v1
+; GFX12-NEXT: v_mov_b32_e32 v11, v1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1
+; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28
+; GFX12-NEXT: v_mov_b32_e32 v28, v1
+; GFX12-NEXT: s_clause 0x4
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = zext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -6011,6 +7442,132 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; EG-NEXT: MOV T40.W, T40.Z,
; EG-NEXT: MOV * T26.W, T26.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v32i1_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v26, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v28, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 3, s2
+; GFX12-NEXT: s_lshr_b32 s22, s2, 24
+; GFX12-NEXT: s_lshr_b32 s12, s2, 22
+; GFX12-NEXT: s_lshr_b32 s14, s2, 23
+; GFX12-NEXT: v_lshrrev_b16 v6, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, s2
+; GFX12-NEXT: s_lshr_b32 s16, s2, 20
+; GFX12-NEXT: s_lshr_b32 s18, s2, 21
+; GFX12-NEXT: v_lshrrev_b16 v1, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v2, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 6, s22
+; GFX12-NEXT: v_lshrrev_b16 v14, 7, s22
+; GFX12-NEXT: v_lshrrev_b16 v9, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s22
+; GFX12-NEXT: v_lshrrev_b16 v17, 5, s22
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-NEXT: s_lshr_b32 s4, s2, 18
+; GFX12-NEXT: v_lshrrev_b16 v37, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 2, s22
+; GFX12-NEXT: v_lshrrev_b16 v15, 3, s22
+; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v49, s12
+; GFX12-NEXT: v_lshrrev_b16 v30, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v32, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v11, 1, s22
+; GFX12-NEXT: v_bfe_i32 v7, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v31, v28, 0, 1
+; GFX12-NEXT: v_bfe_i32 v29, v26, 0, 1
+; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s14
+; GFX12-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s16
+; GFX12-NEXT: s_lshr_b32 s6, s2, 19
+; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
+; GFX12-NEXT: v_bfe_i32 v27, v8, 0, 1
+; GFX12-NEXT: v_bfe_i32 v25, v6, 0, 1
+; GFX12-NEXT: v_dual_mov_b32 v54, s17 :: v_dual_mov_b32 v55, s18
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: v_mov_b32_e32 v56, s19
+; GFX12-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-NEXT: s_lshr_b32 s20, s2, 17
+; GFX12-NEXT: v_bfe_i32 v23, v14, 0, 1
+; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 1
+; GFX12-NEXT: v_bfe_i32 v47, v2, 0, 1
+; GFX12-NEXT: v_bfe_i32 v45, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v19, v17, 0, 1
+; GFX12-NEXT: v_bfe_i32 v17, v16, 0, 1
+; GFX12-NEXT: v_bfe_i32 v43, v10, 0, 1
+; GFX12-NEXT: v_bfe_i32 v41, v9, 0, 1
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x10000
+; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1
+; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1
+; GFX12-NEXT: v_bfe_i32 v39, v34, 0, 1
+; GFX12-NEXT: v_bfe_i32 v37, v37, 0, 1
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX12-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX12-NEXT: v_bfe_i32 v35, v32, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31
+; GFX12-NEXT: v_bfe_i32 v33, v30, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v30, 31, v29
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:160
+; GFX12-NEXT: v_dual_mov_b32 v49, s4 :: v_dual_mov_b32 v50, s5
+; GFX12-NEXT: v_dual_mov_b32 v51, s6 :: v_dual_mov_b32 v52, s7
+; GFX12-NEXT: v_mov_b32_e32 v53, s10
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[22:23], 0x10000
+; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
+; GFX12-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GFX12-NEXT: v_dual_mov_b32 v54, s11 :: v_dual_mov_b32 v55, s20
+; GFX12-NEXT: v_dual_mov_b32 v56, s21 :: v_dual_mov_b32 v1, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s2
+; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GFX12-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GFX12-NEXT: v_ashrrev_i32_e32 v48, 31, v47
+; GFX12-NEXT: v_ashrrev_i32_e32 v46, 31, v45
+; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GFX12-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43
+; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41
+; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GFX12-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39
+; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37
+; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX12-NEXT: v_ashrrev_i32_e32 v36, 31, v35
+; GFX12-NEXT: v_ashrrev_i32_e32 v34, 31, v33
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v10, s3
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1]
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:192
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = sext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -6871,6 +8428,220 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; EG-NEXT: 2(2.802597e-45), 496(6.950440e-43)
; EG-NEXT: LSHR * T82.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v64i1_to_v64i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
+; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
+; GFX12-NEXT: v_and_b32_e32 v34, 1, v4
+; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v28, v1 :: v_dual_and_b32 v41, 1, v6
+; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
+; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
+; GFX12-NEXT: s_lshr_b32 s5, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v3, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v24, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s3
+; GFX12-NEXT: v_and_b32_e32 v50, 1, v14
+; GFX12-NEXT: v_and_b32_e32 v47, 1, v18
+; GFX12-NEXT: v_and_b32_e32 v18, 1, v4
+; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
+; GFX12-NEXT: v_lshrrev_b16 v0, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v2, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 1, s2
+; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX12-NEXT: v_and_b32_e32 v42, 1, v8
+; GFX12-NEXT: v_and_b32_e32 v52, 1, v10
+; GFX12-NEXT: v_and_b32_e32 v40, 1, v23
+; GFX12-NEXT: v_dual_mov_b32 v44, v1 :: v_dual_and_b32 v43, 1, v24
+; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s5
+; GFX12-NEXT: v_lshrrev_b16 v24, 4, s5
+; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10014
+; GFX12-NEXT: v_and_b32_e32 v33, 1, v25
+; GFX12-NEXT: v_and_b32_e32 v25, 1, v6
+; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10015
+; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v16, 11, s3
+; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v35, 1, v5
+; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v5, 1, v12
+; GFX12-NEXT: v_lshrrev_b16 v36, 7, s5
+; GFX12-NEXT: v_lshrrev_b16 v37, 6, s5
+; GFX12-NEXT: v_and_b32_e32 v56, 1, v8
+; GFX12-NEXT: v_and_b32_e32 v4, 1, v10
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v23
+; GFX12-NEXT: v_and_b32_e32 v8, 1, v24
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v25
+; GFX12-NEXT: v_and_b32_e32 v23, 1, v2
+; GFX12-NEXT: v_dual_mov_b32 v24, v1 :: v_dual_and_b32 v25, 0xffff, v0
+; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v3
+; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
+; GFX12-NEXT: v_and_b32_e32 v27, 1, v7
+; GFX12-NEXT: v_lshrrev_b16 v9, 10, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 6, s2
+; GFX12-NEXT: v_and_b32_e32 v22, 1, v16
+; GFX12-NEXT: v_lshrrev_b16 v54, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v55, 1, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:96
+; GFX12-NEXT: v_and_b32_e32 v23, 1, v37
+; GFX12-NEXT: v_and_b32_e32 v25, 0xffff, v36
+; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v28, 0xffff, v34
+; GFX12-NEXT: v_dual_mov_b32 v59, v1 :: v_dual_and_b32 v34, 1, v11
+; GFX12-NEXT: v_dual_mov_b32 v35, v1 :: v_dual_and_b32 v36, 0xffff, v35
+; GFX12-NEXT: v_dual_mov_b32 v37, v1 :: v_dual_and_b32 v26, 1, v9
+; GFX12-NEXT: v_mov_b32_e32 v27, v1
+; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v0, 1, v55
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:64
+; GFX12-NEXT: v_and_b32_e32 v34, 1, v13
+; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v41
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v54
+; GFX12-NEXT: global_store_b128 v1, v[26:29], s[0:1] offset:80
+; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:48
+; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v2
+; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 14, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s8
+; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v19, 12, s3
+; GFX12-NEXT: v_lshrrev_b16 v32, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: v_mov_b32_e32 v2, s9
+; GFX12-NEXT: v_lshrrev_b16 v39, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v31, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v28, 10, s3
+; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
+; GFX12-NEXT: s_and_b32 s6, s3, 1
+; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10011
+; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s8
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s2
+; GFX12-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX12-NEXT: v_mov_b32_e32 v2, s8
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
+; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10015
+; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v43
+; GFX12-NEXT: v_and_b32_e32 v41, 1, v15
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: v_mov_b32_e32 v2, s8
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v17, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v46, 7, s4
+; GFX12-NEXT: v_lshrrev_b16 v49, 6, s4
+; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v43, 0xffff, v42
+; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v45, 1, v32
+; GFX12-NEXT: v_and_b32_e32 v47, 0xffff, v47
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v2, s8
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s4
+; GFX12-NEXT: v_lshrrev_b16 v12, 2, s4
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT: s_and_b32 s7, s2, 1
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_and_b32_e32 v51, 1, v17
+; GFX12-NEXT: v_dual_mov_b32 v54, v1 :: v_dual_and_b32 v53, 0xffff, v52
+; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: v_mov_b32_e32 v52, v1
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:32
+; GFX12-NEXT: v_and_b32_e32 v41, 1, v49
+; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v46
+; GFX12-NEXT: v_mov_b32_e32 v13, v1
+; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v56
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v46, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v37
+; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v16, 1, v16
+; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v48, 1, v19
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX12-NEXT: global_store_b128 v1, v[51:54], s[0:1] offset:16
+; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v52, 1, v21
+; GFX12-NEXT: v_and_b32_e32 v54, 0xffff, v20
+; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v50, 0xffff, v50
+; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v18, 0xffff, v18
+; GFX12-NEXT: v_mov_b32_e32 v51, v1
+; GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v27, 1, v39
+; GFX12-NEXT: v_and_b32_e32 v38, 1, v38
+; GFX12-NEXT: v_and_b32_e32 v40, 0xffff, v40
+; GFX12-NEXT: v_and_b32_e32 v56, 1, v28
+; GFX12-NEXT: v_and_b32_e32 v58, 0xffff, v22
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:496
+; GFX12-NEXT: global_store_b128 v1, v[52:55], s[0:1] offset:368
+; GFX12-NEXT: global_store_b128 v1, v[48:51], s[0:1] offset:352
+; GFX12-NEXT: v_mov_b32_e32 v41, v1
+; GFX12-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, v36
+; GFX12-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_and_b32 v33, 0xffff, v33
+; GFX12-NEXT: v_mov_b32_e32 v32, v1
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v1, v[56:59], s[0:1] offset:336
+; GFX12-NEXT: global_store_b128 v1, v[45:48], s[0:1] offset:320
+; GFX12-NEXT: global_store_b128 v1, v[38:41], s[0:1] offset:304
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, v30
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v12, 1, v12
+; GFX12-NEXT: v_mov_b32_e32 v15, v1
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:272
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:480
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_dual_mov_b32 v2, v35 :: v_dual_mov_b32 v9, v1
+; GFX12-NEXT: v_mov_b32_e32 v11, v1
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v28, v1
+; GFX12-NEXT: v_mov_b32_e32 v30, v1
+; GFX12-NEXT: s_clause 0x4
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:464
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:288
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = zext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, ptr addrspace(1) %out
@@ -7947,6 +9718,253 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; EG-NEXT: MOV T80.W, T80.Z,
; EG-NEXT: MOV * T50.W, T50.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v64i1_to_v64i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s19, s5
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s26, s3, 22
+; GFX12-NEXT: s_lshr_b32 s28, s3, 23
+; GFX12-NEXT: s_lshr_b32 s30, s3, 20
+; GFX12-NEXT: s_lshr_b32 s34, s3, 21
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX12-NEXT: s_lshr_b32 s20, s3, 18
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX12-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v43, s27
+; GFX12-NEXT: v_dual_mov_b32 v42, s26 :: v_dual_mov_b32 v45, s29
+; GFX12-NEXT: v_dual_mov_b32 v44, s28 :: v_dual_mov_b32 v47, s31
+; GFX12-NEXT: s_lshr_b32 s22, s3, 19
+; GFX12-NEXT: v_dual_mov_b32 v46, s30 :: v_dual_mov_b32 v49, s35
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX12-NEXT: v_mov_b32_e32 v48, s34
+; GFX12-NEXT: s_lshr_b32 s24, s3, 16
+; GFX12-NEXT: s_lshr_b32 s36, s3, 17
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX12-NEXT: s_lshr_b32 s12, s2, 22
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:432
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:416
+; GFX12-NEXT: v_dual_mov_b32 v43, s21 :: v_dual_mov_b32 v42, s20
+; GFX12-NEXT: v_dual_mov_b32 v45, s23 :: v_dual_mov_b32 v44, s22
+; GFX12-NEXT: v_mov_b32_e32 v47, s25
+; GFX12-NEXT: s_lshr_b32 s14, s2, 23
+; GFX12-NEXT: v_dual_mov_b32 v46, s24 :: v_dual_mov_b32 v49, s37
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-NEXT: v_mov_b32_e32 v48, s36
+; GFX12-NEXT: s_lshr_b32 s16, s2, 20
+; GFX12-NEXT: s_lshr_b32 s40, s2, 21
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-NEXT: s_lshr_b32 s6, s2, 18
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:400
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:384
+; GFX12-NEXT: v_dual_mov_b32 v43, s13 :: v_dual_mov_b32 v42, s12
+; GFX12-NEXT: v_dual_mov_b32 v45, s15 :: v_dual_mov_b32 v44, s14
+; GFX12-NEXT: v_mov_b32_e32 v47, s17
+; GFX12-NEXT: s_lshr_b32 s8, s2, 19
+; GFX12-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v49, s41
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-NEXT: v_mov_b32_e32 v48, s40
+; GFX12-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-NEXT: v_lshrrev_b16 v3, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 13, s2
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:160
+; GFX12-NEXT: v_dual_mov_b32 v43, s7 :: v_dual_mov_b32 v42, s6
+; GFX12-NEXT: v_dual_mov_b32 v45, s9 :: v_dual_mov_b32 v44, s8
+; GFX12-NEXT: v_mov_b32_e32 v47, s11
+; GFX12-NEXT: s_lshr_b32 s42, s2, 17
+; GFX12-NEXT: v_lshrrev_b16 v32, 10, s2
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v35, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v27, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v29, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v30, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v31, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v24, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v18, 14, s3
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
+; GFX12-NEXT: v_lshrrev_b16 v16, 12, s3
+; GFX12-NEXT: v_lshrrev_b16 v19, 13, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 10, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 11, s3
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v15, 9, s3
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v22, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v26, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v28, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v36, 1, s3
+; GFX12-NEXT: s_lshr_b32 s18, s3, 24
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_lshr_b32 s38, s2, 24
+; GFX12-NEXT: v_dual_mov_b32 v46, s10 :: v_dual_mov_b32 v49, s43
+; GFX12-NEXT: v_bfe_i32 v52, v5, 0, 1
+; GFX12-NEXT: v_bfe_i32 v50, v3, 0, 1
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: v_mov_b32_e32 v48, s42
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:144
+; GFX12-NEXT: v_bfe_i32 v44, v9, 0, 1
+; GFX12-NEXT: v_bfe_i32 v42, v7, 0, 1
+; GFX12-NEXT: v_lshrrev_b16 v41, 2, s18
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:128
+; GFX12-NEXT: v_lshrrev_b16 v54, 3, s18
+; GFX12-NEXT: v_lshrrev_b16 v56, 6, s38
+; GFX12-NEXT: v_ashrrev_i32_e32 v53, 31, v52
+; GFX12-NEXT: v_ashrrev_i32_e32 v51, 31, v50
+; GFX12-NEXT: v_ashrrev_i32_e32 v45, 31, v44
+; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42
+; GFX12-NEXT: v_bfe_i32 v46, v56, 0, 1
+; GFX12-NEXT: v_bfe_i32 v56, v54, 0, 1
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:112
+; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:96
+; GFX12-NEXT: v_bfe_i32 v32, v32, 0, 1
+; GFX12-NEXT: v_bfe_i32 v54, v41, 0, 1
+; GFX12-NEXT: v_bfe_i32 v43, v35, 0, 1
+; GFX12-NEXT: v_bfe_i32 v41, v33, 0, 1
+; GFX12-NEXT: v_lshrrev_b16 v39, 4, s18
+; GFX12-NEXT: v_ashrrev_i32_e32 v35, 31, v34
+; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32
+; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43
+; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41
+; GFX12-NEXT: v_lshrrev_b16 v40, 5, s18
+; GFX12-NEXT: v_lshrrev_b16 v37, 6, s18
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:80
+; GFX12-NEXT: v_bfe_i32 v32, v39, 0, 1
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1] offset:64
+; GFX12-NEXT: v_bfe_i32 v41, v29, 0, 1
+; GFX12-NEXT: v_bfe_i32 v39, v27, 0, 1
+; GFX12-NEXT: v_bfe_i32 v34, v40, 0, 1
+; GFX12-NEXT: v_bfe_i32 v60, v31, 0, 1
+; GFX12-NEXT: v_bfe_i32 v58, v30, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41
+; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39
+; GFX12-NEXT: v_lshrrev_b16 v38, 7, s18
+; GFX12-NEXT: v_bfe_i32 v62, v37, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v61, 31, v60
+; GFX12-NEXT: v_ashrrev_i32_e32 v59, 31, v58
+; GFX12-NEXT: global_store_b128 v12, v[39:42], s[0:1] offset:48
+; GFX12-NEXT: v_bfe_i32 v39, v25, 0, 1
+; GFX12-NEXT: v_bfe_i32 v37, v24, 0, 1
+; GFX12-NEXT: v_bfe_i32 v64, v38, 0, 1
+; GFX12-NEXT: global_store_b128 v12, v[58:61], s[0:1] offset:32
+; GFX12-NEXT: v_bfe_i32 v43, v23, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39
+; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: v_bfe_i32 v24, v36, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43
+; GFX12-NEXT: v_dual_mov_b32 v41, s2 :: v_dual_mov_b32 v42, s3
+; GFX12-NEXT: v_mov_b32_e32 v23, s5
+; GFX12-NEXT: global_store_b128 v12, v[37:40], s[0:1] offset:16
+; GFX12-NEXT: v_bfe_i32 v38, v20, 0, 1
+; GFX12-NEXT: v_bfe_i32 v36, v18, 0, 1
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1]
+; GFX12-NEXT: v_bfe_i32 v20, v19, 0, 1
+; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38
+; GFX12-NEXT: v_ashrrev_i32_e32 v37, 31, v36
+; GFX12-NEXT: v_lshrrev_b16 v55, 1, s18
+; GFX12-NEXT: v_bfe_i32 v40, v21, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX12-NEXT: v_lshrrev_b16 v9, 7, s38
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s38
+; GFX12-NEXT: v_lshrrev_b16 v8, 2, s38
+; GFX12-NEXT: v_lshrrev_b16 v10, 5, s38
+; GFX12-NEXT: v_lshrrev_b16 v11, 4, s38
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s38
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:368
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:352
+; GFX12-NEXT: v_bfe_i32 v38, v1, 0, 1
+; GFX12-NEXT: v_bfe_i32 v36, v0, 0, 1
+; GFX12-NEXT: v_bfe_i32 v52, v55, 0, 1
+; GFX12-NEXT: v_bfe_i32 v20, v15, 0, 1
+; GFX12-NEXT: v_bfe_i32 v18, v13, 0, 1
+; GFX12-NEXT: v_bfe_i32 v48, v9, 0, 1
+; GFX12-NEXT: v_bfe_i32 v16, v17, 0, 1
+; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1
+; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1
+; GFX12-NEXT: v_bfe_i32 v4, v8, 0, 1
+; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1
+; GFX12-NEXT: v_bfe_i32 v8, v11, 0, 1
+; GFX12-NEXT: v_bfe_i32 v42, v22, 0, 1
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[18:19], 0x10000
+; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1
+; GFX12-NEXT: v_bfe_i32 v26, v26, 0, 1
+; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX12-NEXT: v_ashrrev_i32_e32 v57, 31, v56
+; GFX12-NEXT: v_ashrrev_i32_e32 v55, 31, v54
+; GFX12-NEXT: v_ashrrev_i32_e32 v65, 31, v64
+; GFX12-NEXT: v_ashrrev_i32_e32 v63, 31, v62
+; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38
+; GFX12-NEXT: v_ashrrev_i32_e32 v37, 31, v36
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[38:39], 0x10000
+; GFX12-NEXT: v_ashrrev_i32_e32 v53, 31, v52
+; GFX12-NEXT: v_ashrrev_i32_e32 v35, 31, v34
+; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32
+; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX12-NEXT: v_dual_mov_b32 v22, s4 :: v_dual_mov_b32 v51, s9
+; GFX12-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_ashrrev_i32_e32 v49, 31, v48
+; GFX12-NEXT: v_ashrrev_i32_e32 v47, 31, v46
+; GFX12-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42
+; GFX12-NEXT: v_ashrrev_i32_e32 v41, 31, v40
+; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GFX12-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:336
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:320
+; GFX12-NEXT: global_store_b128 v12, v[14:17], s[0:1] offset:304
+; GFX12-NEXT: global_store_b128 v12, v[40:43], s[0:1] offset:288
+; GFX12-NEXT: global_store_b128 v12, v[26:29], s[0:1] offset:272
+; GFX12-NEXT: global_store_b128 v12, v[22:25], s[0:1] offset:256
+; GFX12-NEXT: global_store_b128 v12, v[62:65], s[0:1] offset:496
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:480
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v12, v[54:57], s[0:1] offset:464
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:448
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:240
+; GFX12-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:224
+; GFX12-NEXT: global_store_b128 v12, v[4:7], s[0:1] offset:208
+; GFX12-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = sext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 5332da6827ec3f..585f96b9ffb2e6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-HSA %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-NOHSA-VI %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_i16:
@@ -73,6 +74,18 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load i16, ptr addrspace(4) %in
store i16 %ld, ptr addrspace(1) %out
@@ -131,6 +144,18 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v2i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <2 x i16>, ptr addrspace(4) %in
store <2 x i16> %ld, ptr addrspace(1) %out
@@ -224,6 +249,21 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: OR_INT T6.X, PV.W, PS,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v3i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] offset:4
+; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i16>, ptr addrspace(4) %in
store <3 x i16> %ld, ptr addrspace(1) %out
@@ -285,6 +325,19 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v4i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <4 x i16>, ptr addrspace(4) %in
store <4 x i16> %ld, ptr addrspace(1) %out
@@ -352,6 +405,20 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v8i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <8 x i16>, ptr addrspace(4) %in
store <8 x i16> %ld, ptr addrspace(1) %out
@@ -452,6 +519,24 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
; EG-NEXT: ALU clause starting at 17:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v16i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <16 x i16>, ptr addrspace(4) %in
store <16 x i16> %ld, ptr addrspace(1) %out
@@ -654,6 +739,36 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MOV * T2.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v16i16_align2:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v8, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0xf
+; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
+; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
+; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
+; GFX12-NEXT: global_load_u16 v0, v8, s[0:1] offset:16
+; GFX12-NEXT: global_load_u16 v7, v8, s[0:1] offset:12
+; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
+; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
+; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
+; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
+; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
+; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
+; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
+; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
+; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
+; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
+; GFX12-NEXT: s_waitcnt vmcnt(4)
+; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <16 x i16>, ptr addrspace(4) %ptr0, align 2
store <16 x i16> %ld, ptr addrspace(1) undef, align 32
@@ -719,6 +834,18 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_i16_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i16, ptr addrspace(4) %in
%ext = zext i16 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -785,6 +912,18 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_sextload_i16_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i16, ptr addrspace(4) %in
%ext = sext i16 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -850,6 +989,18 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v1i16_to_v1i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i16>, ptr addrspace(4) %in
%ext = zext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -916,6 +1067,18 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_sextload_v1i16_to_v1i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i16>, ptr addrspace(4) %in
%ext = sext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -986,6 +1149,22 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_zextload_v2i16_to_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s3, s2, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i16>, ptr addrspace(4) %in
%ext = zext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -1058,6 +1237,22 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v2i16_to_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_sext_i32_i16 s3, s2
+; GFX12-NEXT: s_ashr_i32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i16>, ptr addrspace(4) %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -1140,6 +1335,22 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; EG-NEXT: LSHR T4.X, T0.W, literal.x,
; EG-NEXT: MOV * T3.Y, T1.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v3i16_to_v3i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: s_and_b32 s4, s2, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i16>, ptr addrspace(4) %in
%ext = zext <3 x i16> %ld to <3 x i32>
@@ -1226,6 +1437,22 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v3i16_to_v3i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s4, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s2
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i16>, ptr addrspace(4) %in
%ext = sext <3 x i16> %ld to <3 x i32>
@@ -1315,6 +1542,25 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_zextload_v4i16_to_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s4, s3, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: s_and_b32 s5, s2, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(4) %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1406,6 +1652,24 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+;
+; GFX12-LABEL: constant_sextload_v4i16_to_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s4, s3, 16
+; GFX12-NEXT: s_ashr_i32 s5, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s2
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(4) %in
%ext = sext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1540,6 +1804,32 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR * T10.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v8i16_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s8, s7, 16
+; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX12-NEXT: s_and_b32 s9, s6, 0xffff
+; GFX12-NEXT: s_lshr_b32 s6, s6, 16
+; GFX12-NEXT: s_lshr_b32 s2, s5, 16
+; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
+; GFX12-NEXT: s_lshr_b32 s5, s4, 16
+; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i16>, ptr addrspace(4) %in
%ext = zext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1676,6 +1966,32 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; EG-NEXT: LSHR T10.X, PS, literal.x,
; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+;
+; GFX12-LABEL: constant_sextload_v8i16_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s8, s7, 16
+; GFX12-NEXT: s_ashr_i32 s9, s6, 16
+; GFX12-NEXT: s_sext_i32_i16 s6, s6
+; GFX12-NEXT: s_sext_i32_i16 s7, s7
+; GFX12-NEXT: s_ashr_i32 s2, s5, 16
+; GFX12-NEXT: s_ashr_i32 s3, s4, 16
+; GFX12-NEXT: s_sext_i32_i16 s5, s5
+; GFX12-NEXT: s_sext_i32_i16 s4, s4
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i16>, ptr addrspace(4) %in
%ext = sext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1900,6 +2216,46 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T18.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v16i16_to_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s16, s11, 16
+; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
+; GFX12-NEXT: s_and_b32 s17, s10, 0xffff
+; GFX12-NEXT: s_lshr_b32 s10, s10, 16
+; GFX12-NEXT: s_lshr_b32 s14, s9, 16
+; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX12-NEXT: s_lshr_b32 s15, s8, 16
+; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
+; GFX12-NEXT: s_lshr_b32 s12, s7, 16
+; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX12-NEXT: s_lshr_b32 s13, s6, 16
+; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
+; GFX12-NEXT: s_lshr_b32 s2, s5, 16
+; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
+; GFX12-NEXT: s_lshr_b32 s5, s4, 16
+; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
+; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i16>, ptr addrspace(4) %in
%ext = zext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2128,6 +2484,46 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; EG-NEXT: LSHR T12.X, PS, literal.x,
; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+;
+; GFX12-LABEL: constant_sextload_v16i16_to_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s16, s11, 16
+; GFX12-NEXT: s_ashr_i32 s17, s10, 16
+; GFX12-NEXT: s_sext_i32_i16 s10, s10
+; GFX12-NEXT: s_sext_i32_i16 s11, s11
+; GFX12-NEXT: s_ashr_i32 s14, s9, 16
+; GFX12-NEXT: s_ashr_i32 s15, s8, 16
+; GFX12-NEXT: s_sext_i32_i16 s9, s9
+; GFX12-NEXT: s_sext_i32_i16 s8, s8
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
+; GFX12-NEXT: s_ashr_i32 s12, s7, 16
+; GFX12-NEXT: s_ashr_i32 s13, s6, 16
+; GFX12-NEXT: s_sext_i32_i16 s7, s7
+; GFX12-NEXT: s_sext_i32_i16 s6, s6
+; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
+; GFX12-NEXT: s_ashr_i32 s2, s5, 16
+; GFX12-NEXT: s_ashr_i32 s3, s4, 16
+; GFX12-NEXT: s_sext_i32_i16 s5, s5
+; GFX12-NEXT: s_sext_i32_i16 s4, s4
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
+; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i16>, ptr addrspace(4) %in
%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2538,6 +2934,76 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; EG-NEXT: LSHR * T34.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v32i16_to_v32i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s33, s15, 16
+; GFX12-NEXT: s_and_b32 s15, s15, 0xffff
+; GFX12-NEXT: s_and_b32 s34, s14, 0xffff
+; GFX12-NEXT: s_lshr_b32 s14, s14, 16
+; GFX12-NEXT: s_lshr_b32 s30, s13, 16
+; GFX12-NEXT: s_and_b32 s13, s13, 0xffff
+; GFX12-NEXT: s_lshr_b32 s31, s12, 16
+; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s14
+; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s33
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31
+; GFX12-NEXT: s_lshr_b32 s29, s10, 16
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29
+; GFX12-NEXT: s_lshr_b32 s28, s11, 16
+; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
+; GFX12-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX12-NEXT: s_lshr_b32 s26, s9, 16
+; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX12-NEXT: s_lshr_b32 s27, s8, 16
+; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
+; GFX12-NEXT: s_lshr_b32 s24, s7, 16
+; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX12-NEXT: s_lshr_b32 s25, s6, 16
+; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
+; GFX12-NEXT: v_mov_b32_e32 v10, s11
+; GFX12-NEXT: s_lshr_b32 s22, s5, 16
+; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX12-NEXT: s_lshr_b32 s23, s4, 16
+; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s8
+; GFX12-NEXT: v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v2, s9
+; GFX12-NEXT: v_mov_b32_e32 v5, s25
+; GFX12-NEXT: s_lshr_b32 s20, s3, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: s_lshr_b32 s21, s2, 16
+; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s24
+; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v13, s23
+; GFX12-NEXT: s_lshr_b32 s18, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_lshr_b32 s19, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s22
+; GFX12-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v17, s21
+; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s20
+; GFX12-NEXT: v_dual_mov_b32 v18, s3 :: v_dual_mov_b32 v21, s19
+; GFX12-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s18
+; GFX12-NEXT: v_mov_b32_e32 v22, s1
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:64
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i16>, ptr addrspace(4) %in
%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -2958,6 +3424,76 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; EG-NEXT: LSHR T24.X, PS, literal.x,
; EG-NEXT: BFE_INT * T34.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+;
+; GFX12-LABEL: constant_sextload_v32i16_to_v32i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s33, s15, 16
+; GFX12-NEXT: s_ashr_i32 s34, s14, 16
+; GFX12-NEXT: s_sext_i32_i16 s14, s14
+; GFX12-NEXT: s_sext_i32_i16 s15, s15
+; GFX12-NEXT: s_ashr_i32 s30, s13, 16
+; GFX12-NEXT: s_ashr_i32 s31, s12, 16
+; GFX12-NEXT: s_sext_i32_i16 s13, s13
+; GFX12-NEXT: s_sext_i32_i16 s12, s12
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s34
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s33
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31
+; GFX12-NEXT: s_ashr_i32 s29, s10, 16
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29
+; GFX12-NEXT: s_ashr_i32 s28, s11, 16
+; GFX12-NEXT: s_sext_i32_i16 s11, s11
+; GFX12-NEXT: s_sext_i32_i16 s10, s10
+; GFX12-NEXT: s_ashr_i32 s26, s9, 16
+; GFX12-NEXT: s_ashr_i32 s27, s8, 16
+; GFX12-NEXT: s_sext_i32_i16 s9, s9
+; GFX12-NEXT: s_sext_i32_i16 s8, s8
+; GFX12-NEXT: s_ashr_i32 s24, s7, 16
+; GFX12-NEXT: s_ashr_i32 s25, s6, 16
+; GFX12-NEXT: s_sext_i32_i16 s7, s7
+; GFX12-NEXT: s_sext_i32_i16 s6, s6
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
+; GFX12-NEXT: v_mov_b32_e32 v10, s11
+; GFX12-NEXT: s_ashr_i32 s22, s5, 16
+; GFX12-NEXT: s_ashr_i32 s23, s4, 16
+; GFX12-NEXT: s_sext_i32_i16 s5, s5
+; GFX12-NEXT: s_sext_i32_i16 s4, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s8
+; GFX12-NEXT: v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v2, s9
+; GFX12-NEXT: v_mov_b32_e32 v5, s25
+; GFX12-NEXT: s_ashr_i32 s20, s3, 16
+; GFX12-NEXT: s_ashr_i32 s21, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_sext_i32_i16 s2, s2
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s24
+; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v13, s23
+; GFX12-NEXT: s_ashr_i32 s18, s1, 16
+; GFX12-NEXT: s_ashr_i32 s19, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s22
+; GFX12-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v17, s21
+; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s20
+; GFX12-NEXT: v_dual_mov_b32 v18, s3 :: v_dual_mov_b32 v21, s19
+; GFX12-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s18
+; GFX12-NEXT: v_mov_b32_e32 v22, s1
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:64
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i16>, ptr addrspace(4) %in
%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3751,6 +4287,136 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
; EG-NEXT: LSHR * T66.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v64i16_to_v64i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0
+; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s49, s31, 16
+; GFX12-NEXT: s_lshr_b32 s65, s15, 16
+; GFX12-NEXT: s_lshr_b32 s66, s14, 16
+; GFX12-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX12-NEXT: s_and_b32 s15, s15, 0xffff
+; GFX12-NEXT: s_lshr_b32 s63, s13, 16
+; GFX12-NEXT: s_lshr_b32 s64, s12, 16
+; GFX12-NEXT: s_and_b32 s13, s13, 0xffff
+; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s66
+; GFX12-NEXT: s_lshr_b32 s61, s11, 16
+; GFX12-NEXT: s_lshr_b32 s62, s10, 16
+; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
+; GFX12-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s65
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s64
+; GFX12-NEXT: s_lshr_b32 s59, s9, 16
+; GFX12-NEXT: s_lshr_b32 s60, s8, 16
+; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s63
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s62
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s61
+; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s60
+; GFX12-NEXT: s_lshr_b32 s57, s7, 16
+; GFX12-NEXT: s_lshr_b32 s58, s6, 16
+; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s59
+; GFX12-NEXT: v_mov_b32_e32 v14, s9
+; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: s_lshr_b32 s55, s5, 16
+; GFX12-NEXT: s_lshr_b32 s56, s4, 16
+; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX12-NEXT: s_lshr_b32 s53, s3, 16
+; GFX12-NEXT: s_lshr_b32 s54, s2, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v1, s58 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s7
+; GFX12-NEXT: v_mov_b32_e32 v5, s56
+; GFX12-NEXT: s_lshr_b32 s51, s1, 16
+; GFX12-NEXT: s_lshr_b32 s52, s0, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s55
+; GFX12-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v9, s54
+; GFX12-NEXT: s_lshr_b32 s50, s30, 16
+; GFX12-NEXT: s_and_b32 s31, s31, 0xffff
+; GFX12-NEXT: s_and_b32 s30, s30, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s53
+; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s52
+; GFX12-NEXT: s_lshr_b32 s45, s27, 16
+; GFX12-NEXT: s_lshr_b32 s46, s26, 16
+; GFX12-NEXT: s_lshr_b32 s47, s29, 16
+; GFX12-NEXT: s_lshr_b32 s48, s28, 16
+; GFX12-NEXT: s_and_b32 s27, s27, 0xffff
+; GFX12-NEXT: s_and_b32 s26, s26, 0xffff
+; GFX12-NEXT: s_and_b32 s29, s29, 0xffff
+; GFX12-NEXT: s_and_b32 s28, s28, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s51
+; GFX12-NEXT: v_dual_mov_b32 v14, s1 :: v_dual_mov_b32 v17, s50
+; GFX12-NEXT: s_lshr_b32 s43, s25, 16
+; GFX12-NEXT: s_lshr_b32 s44, s24, 16
+; GFX12-NEXT: s_and_b32 s25, s25, 0xffff
+; GFX12-NEXT: s_and_b32 s24, s24, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v16, s30 :: v_dual_mov_b32 v19, s49
+; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v21, s48
+; GFX12-NEXT: s_lshr_b32 s41, s23, 16
+; GFX12-NEXT: s_lshr_b32 s42, s22, 16
+; GFX12-NEXT: s_and_b32 s23, s23, 0xffff
+; GFX12-NEXT: s_and_b32 s22, s22, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v23, s47
+; GFX12-NEXT: v_mov_b32_e32 v22, s29
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s26
+; GFX12-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v2, s27
+; GFX12-NEXT: v_mov_b32_e32 v5, s44
+; GFX12-NEXT: s_lshr_b32 s39, s21, 16
+; GFX12-NEXT: s_lshr_b32 s40, s20, 16
+; GFX12-NEXT: s_and_b32 s21, s21, 0xffff
+; GFX12-NEXT: s_and_b32 s20, s20, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s43
+; GFX12-NEXT: v_dual_mov_b32 v6, s25 :: v_dual_mov_b32 v9, s42
+; GFX12-NEXT: s_lshr_b32 s35, s19, 16
+; GFX12-NEXT: s_lshr_b32 s38, s18, 16
+; GFX12-NEXT: s_and_b32 s19, s19, 0xffff
+; GFX12-NEXT: s_and_b32 s18, s18, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s41
+; GFX12-NEXT: v_dual_mov_b32 v10, s23 :: v_dual_mov_b32 v13, s40
+; GFX12-NEXT: s_lshr_b32 s33, s17, 16
+; GFX12-NEXT: s_lshr_b32 s34, s16, 16
+; GFX12-NEXT: s_and_b32 s17, s17, 0xffff
+; GFX12-NEXT: s_and_b32 s16, s16, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v15, s39
+; GFX12-NEXT: v_dual_mov_b32 v14, s21 :: v_dual_mov_b32 v17, s38
+; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s35
+; GFX12-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v21, s34
+; GFX12-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v23, s33
+; GFX12-NEXT: v_mov_b32_e32 v22, s17
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i16>, ptr addrspace(4) %in
%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4560,6 +5226,136 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; EG-NEXT: LSHR T48.X, PS, literal.x,
; EG-NEXT: BFE_INT * T66.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+;
+; GFX12-LABEL: constant_sextload_v64i16_to_v64i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40
+; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s65, s15, 16
+; GFX12-NEXT: s_ashr_i32 s66, s14, 16
+; GFX12-NEXT: s_sext_i32_i16 s14, s14
+; GFX12-NEXT: s_sext_i32_i16 s15, s15
+; GFX12-NEXT: s_ashr_i32 s63, s13, 16
+; GFX12-NEXT: s_ashr_i32 s64, s12, 16
+; GFX12-NEXT: s_sext_i32_i16 s13, s13
+; GFX12-NEXT: s_sext_i32_i16 s12, s12
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s66
+; GFX12-NEXT: s_ashr_i32 s61, s11, 16
+; GFX12-NEXT: s_ashr_i32 s62, s10, 16
+; GFX12-NEXT: s_sext_i32_i16 s11, s11
+; GFX12-NEXT: s_sext_i32_i16 s10, s10
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s65
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s64
+; GFX12-NEXT: s_ashr_i32 s59, s9, 16
+; GFX12-NEXT: s_ashr_i32 s60, s8, 16
+; GFX12-NEXT: s_sext_i32_i16 s9, s9
+; GFX12-NEXT: s_sext_i32_i16 s8, s8
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s63
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s62
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s61
+; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s60
+; GFX12-NEXT: s_ashr_i32 s57, s7, 16
+; GFX12-NEXT: s_ashr_i32 s58, s6, 16
+; GFX12-NEXT: s_sext_i32_i16 s7, s7
+; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s59
+; GFX12-NEXT: v_mov_b32_e32 v14, s9
+; GFX12-NEXT: s_sext_i32_i16 s6, s6
+; GFX12-NEXT: s_ashr_i32 s55, s5, 16
+; GFX12-NEXT: s_ashr_i32 s56, s4, 16
+; GFX12-NEXT: s_sext_i32_i16 s5, s5
+; GFX12-NEXT: s_sext_i32_i16 s4, s4
+; GFX12-NEXT: s_ashr_i32 s53, s3, 16
+; GFX12-NEXT: s_ashr_i32 s54, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_sext_i32_i16 s2, s2
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v1, s58 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s7
+; GFX12-NEXT: v_mov_b32_e32 v5, s56
+; GFX12-NEXT: s_ashr_i32 s51, s1, 16
+; GFX12-NEXT: s_ashr_i32 s52, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s55
+; GFX12-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v9, s54
+; GFX12-NEXT: s_ashr_i32 s49, s31, 16
+; GFX12-NEXT: s_ashr_i32 s50, s30, 16
+; GFX12-NEXT: s_sext_i32_i16 s31, s31
+; GFX12-NEXT: s_sext_i32_i16 s30, s30
+; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s53
+; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s52
+; GFX12-NEXT: s_ashr_i32 s45, s27, 16
+; GFX12-NEXT: s_ashr_i32 s46, s26, 16
+; GFX12-NEXT: s_sext_i32_i16 s27, s27
+; GFX12-NEXT: s_sext_i32_i16 s26, s26
+; GFX12-NEXT: s_ashr_i32 s47, s29, 16
+; GFX12-NEXT: s_ashr_i32 s48, s28, 16
+; GFX12-NEXT: s_sext_i32_i16 s29, s29
+; GFX12-NEXT: s_sext_i32_i16 s28, s28
+; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s51
+; GFX12-NEXT: v_dual_mov_b32 v14, s1 :: v_dual_mov_b32 v17, s50
+; GFX12-NEXT: s_ashr_i32 s43, s25, 16
+; GFX12-NEXT: s_ashr_i32 s44, s24, 16
+; GFX12-NEXT: s_sext_i32_i16 s25, s25
+; GFX12-NEXT: s_sext_i32_i16 s24, s24
+; GFX12-NEXT: v_dual_mov_b32 v16, s30 :: v_dual_mov_b32 v19, s49
+; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v21, s48
+; GFX12-NEXT: s_ashr_i32 s41, s23, 16
+; GFX12-NEXT: s_ashr_i32 s42, s22, 16
+; GFX12-NEXT: s_sext_i32_i16 s23, s23
+; GFX12-NEXT: s_sext_i32_i16 s22, s22
+; GFX12-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v23, s47
+; GFX12-NEXT: v_mov_b32_e32 v22, s29
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s26
+; GFX12-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v2, s27
+; GFX12-NEXT: v_mov_b32_e32 v5, s44
+; GFX12-NEXT: s_ashr_i32 s39, s21, 16
+; GFX12-NEXT: s_ashr_i32 s40, s20, 16
+; GFX12-NEXT: s_sext_i32_i16 s21, s21
+; GFX12-NEXT: s_sext_i32_i16 s20, s20
+; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s43
+; GFX12-NEXT: v_dual_mov_b32 v6, s25 :: v_dual_mov_b32 v9, s42
+; GFX12-NEXT: s_ashr_i32 s35, s19, 16
+; GFX12-NEXT: s_ashr_i32 s38, s18, 16
+; GFX12-NEXT: s_sext_i32_i16 s19, s19
+; GFX12-NEXT: s_sext_i32_i16 s18, s18
+; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s41
+; GFX12-NEXT: v_dual_mov_b32 v10, s23 :: v_dual_mov_b32 v13, s40
+; GFX12-NEXT: s_ashr_i32 s33, s17, 16
+; GFX12-NEXT: s_ashr_i32 s34, s16, 16
+; GFX12-NEXT: s_sext_i32_i16 s17, s17
+; GFX12-NEXT: s_sext_i32_i16 s16, s16
+; GFX12-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v15, s39
+; GFX12-NEXT: v_dual_mov_b32 v14, s21 :: v_dual_mov_b32 v17, s38
+; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s35
+; GFX12-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v21, s34
+; GFX12-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v23, s33
+; GFX12-NEXT: v_mov_b32_e32 v22, s17
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i16>, ptr addrspace(4) %in
%ext = sext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4630,6 +5426,19 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_i16_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i16, ptr addrspace(4) %in
%ext = zext i16 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4707,6 +5516,21 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_i16_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i16, ptr addrspace(4) %in
%ext = sext i16 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4777,6 +5601,19 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v1i16_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i16>, ptr addrspace(4) %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4849,6 +5686,21 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v1i16_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i16>, ptr addrspace(4) %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4927,6 +5779,23 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; EG-NEXT: MOV T4.W, 0.0,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_zextload_v2i16_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i16>, ptr addrspace(4) %in
%ext = zext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -5010,6 +5879,23 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T4.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v2i16_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i16>, ptr addrspace(4) %in
%ext = sext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -5120,6 +6006,28 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v4i16_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s4, 0xffff, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
+; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(4) %in
%ext = zext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5245,6 +6153,31 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; EG-NEXT: LSHR T8.X, PV.W, literal.x,
; EG-NEXT: ASHR * T7.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v4i16_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: s_lshr_b32 s8, s3, 16
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s9
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(4) %in
%ext = sext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5421,6 +6354,37 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v8i16_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0
+; GFX12-NEXT: s_and_b32 s3, 0xffff, s5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0
+; GFX12-NEXT: s_and_b32 s3, 0xffff, s4
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i16>, ptr addrspace(4) %in
%ext = zext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5631,6 +6595,44 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; EG-NEXT: LSHR T7.X, PV.W, literal.x,
; EG-NEXT: ASHR * T14.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v8i16_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s14, s7
+; GFX12-NEXT: s_lshr_b32 s16, s7, 16
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000
+; GFX12-NEXT: s_lshr_b32 s6, s6, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
+; GFX12-NEXT: s_mov_b32 s8, s5
+; GFX12-NEXT: s_lshr_b32 s10, s5, 16
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
+; GFX12-NEXT: s_lshr_b32 s4, s4, 16
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v9, s15
+; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17
+; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s9
+; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s11
+; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s5
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i16>, ptr addrspace(4) %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5940,6 +6942,58 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR * T26.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v16i16_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s10, s5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
+; GFX12-NEXT: s_lshr_b32 s5, s5, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_lshr_b32 s5, s4, 16
+; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshr_b32 s4, s7, 16
+; GFX12-NEXT: s_and_b32 s5, s7, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_lshr_b32 s4, s6, 16
+; GFX12-NEXT: s_and_b32 s5, s6, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_lshr_b32 s4, s3, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_lshr_b32 s3, s2, 16
+; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: s_lshr_b32 s2, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i16>, ptr addrspace(4) %in
%ext = zext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6324,6 +7378,71 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; EG-NEXT: LSHR T12.X, PV.W, literal.x,
; EG-NEXT: ASHR * T26.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v16i16_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s30, s5
+; GFX12-NEXT: s_lshr_b32 s34, s5, 16
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000
+; GFX12-NEXT: s_lshr_b32 s4, s4, 16
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000
+; GFX12-NEXT: s_mov_b32 s24, s7
+; GFX12-NEXT: s_lshr_b32 s26, s7, 16
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000
+; GFX12-NEXT: s_lshr_b32 s6, s6, 16
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s29
+; GFX12-NEXT: s_mov_b32 s18, s3
+; GFX12-NEXT: s_lshr_b32 s20, s3, 16
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
+; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
+; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
+; GFX12-NEXT: s_lshr_b32 s12, s0, 16
+; GFX12-NEXT: s_mov_b32 s14, s1
+; GFX12-NEXT: s_lshr_b32 s16, s1, 16
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
+; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25
+; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
+; GFX12-NEXT: v_mov_b32_e32 v18, s20
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14
+; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16
+; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10
+; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i16>, ptr addrspace(4) %in
%ext = sext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6904,6 +8023,98 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: LSHR * T50.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v32i16_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s18, s15, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s18
+; GFX12-NEXT: s_lshr_b32 s15, s15, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_lshr_b32 s15, s14, 16
+; GFX12-NEXT: s_and_b32 s14, s14, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:240
+; GFX12-NEXT: v_mov_b32_e32 v0, s14
+; GFX12-NEXT: v_mov_b32_e32 v2, s15
+; GFX12-NEXT: s_lshr_b32 s14, s13, 16
+; GFX12-NEXT: s_and_b32 s13, s13, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224
+; GFX12-NEXT: v_mov_b32_e32 v0, s13
+; GFX12-NEXT: v_mov_b32_e32 v2, s14
+; GFX12-NEXT: s_lshr_b32 s13, s12, 16
+; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:208
+; GFX12-NEXT: v_mov_b32_e32 v0, s12
+; GFX12-NEXT: v_mov_b32_e32 v2, s13
+; GFX12-NEXT: s_lshr_b32 s12, s11, 16
+; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s12
+; GFX12-NEXT: s_lshr_b32 s11, s10, 16
+; GFX12-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:176
+; GFX12-NEXT: v_mov_b32_e32 v0, s10
+; GFX12-NEXT: v_mov_b32_e32 v2, s11
+; GFX12-NEXT: s_lshr_b32 s10, s9, 16
+; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v0, s9
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshr_b32 s9, s8, 16
+; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: v_mov_b32_e32 v2, s9
+; GFX12-NEXT: s_lshr_b32 s8, s7, 16
+; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s8
+; GFX12-NEXT: s_lshr_b32 s7, s6, 16
+; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s7
+; GFX12-NEXT: s_lshr_b32 s6, s5, 16
+; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_lshr_b32 s5, s4, 16
+; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshr_b32 s4, s3, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_lshr_b32 s3, s2, 16
+; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: s_lshr_b32 s2, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i16>, ptr addrspace(4) %in
%ext = zext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -7640,6 +8851,124 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; EG-NEXT: LSHR T38.X, PV.W, literal.x,
; EG-NEXT: ASHR * T50.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v32i16_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s28, s2, 16
+; GFX12-NEXT: s_lshr_b32 s42, s5, 16
+; GFX12-NEXT: s_lshr_b32 s52, s8, 16
+; GFX12-NEXT: s_mov_b32 s60, s11
+; GFX12-NEXT: s_lshr_b32 s22, s0, 16
+; GFX12-NEXT: s_mov_b32 s24, s1
+; GFX12-NEXT: s_lshr_b32 s26, s1, 16
+; GFX12-NEXT: s_mov_b32 s30, s3
+; GFX12-NEXT: s_lshr_b32 s36, s3, 16
+; GFX12-NEXT: s_lshr_b32 s38, s4, 16
+; GFX12-NEXT: s_mov_b32 s40, s5
+; GFX12-NEXT: s_lshr_b32 s46, s6, 16
+; GFX12-NEXT: s_mov_b32 s48, s7
+; GFX12-NEXT: s_lshr_b32 s50, s7, 16
+; GFX12-NEXT: s_mov_b32 s54, s9
+; GFX12-NEXT: s_lshr_b32 s56, s9, 16
+; GFX12-NEXT: s_bfe_i64 s[44:45], s[10:11], 0x100000
+; GFX12-NEXT: s_lshr_b32 s58, s10, 16
+; GFX12-NEXT: s_lshr_b32 s62, s11, 16
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[52:53], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[52:53], s[60:61], 0x100000
+; GFX12-NEXT: s_lshr_b32 s60, s14, 16
+; GFX12-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000
+; GFX12-NEXT: s_mov_b32 s14, s15
+; GFX12-NEXT: s_lshr_b32 s66, s15, 16
+; GFX12-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[30:31], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[36:37], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[38:39], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[40:41], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[46:47], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[48:49], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[50:51], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[46:47], s[54:55], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[48:49], s[56:57], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[50:51], s[58:59], 0x100000
+; GFX12-NEXT: s_lshr_b32 s54, s12, 16
+; GFX12-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000
+; GFX12-NEXT: s_mov_b32 s12, s13
+; GFX12-NEXT: s_lshr_b32 s58, s13, 16
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s14
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, s66
+; GFX12-NEXT: v_dual_mov_b32 v4, s67 :: v_dual_mov_b32 v5, s64
+; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v6, s65 :: v_dual_mov_b32 v7, s60
+; GFX12-NEXT: v_dual_mov_b32 v8, s61 :: v_dual_mov_b32 v9, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s58
+; GFX12-NEXT: v_dual_mov_b32 v12, s59 :: v_dual_mov_b32 v13, s56
+; GFX12-NEXT: v_dual_mov_b32 v14, s57 :: v_dual_mov_b32 v15, s54
+; GFX12-NEXT: v_mov_b32_e32 v16, s55
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[16:17] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[16:17] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[16:17] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[16:17] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v1, s52 :: v_dual_mov_b32 v2, s53
+; GFX12-NEXT: v_dual_mov_b32 v3, s12 :: v_dual_mov_b32 v4, s13
+; GFX12-NEXT: v_dual_mov_b32 v5, s44 :: v_dual_mov_b32 v6, s45
+; GFX12-NEXT: v_dual_mov_b32 v7, s50 :: v_dual_mov_b32 v8, s51
+; GFX12-NEXT: v_dual_mov_b32 v9, s46 :: v_dual_mov_b32 v10, s47
+; GFX12-NEXT: v_dual_mov_b32 v11, s48 :: v_dual_mov_b32 v12, s49
+; GFX12-NEXT: v_dual_mov_b32 v13, s34 :: v_dual_mov_b32 v14, s35
+; GFX12-NEXT: v_dual_mov_b32 v15, s42 :: v_dual_mov_b32 v16, s43
+; GFX12-NEXT: v_dual_mov_b32 v17, s40 :: v_dual_mov_b32 v18, s41
+; GFX12-NEXT: v_dual_mov_b32 v19, s38 :: v_dual_mov_b32 v20, s39
+; GFX12-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21
+; GFX12-NEXT: v_dual_mov_b32 v23, s36 :: v_dual_mov_b32 v24, s37
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[16:17] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[16:17] offset:160
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[16:17] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[16:17] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[16:17] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[16:17] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v1, s30 :: v_dual_mov_b32 v2, s31
+; GFX12-NEXT: v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v4, s29
+; GFX12-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3
+; GFX12-NEXT: v_dual_mov_b32 v7, s26 :: v_dual_mov_b32 v8, s27
+; GFX12-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
+; GFX12-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX12-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1
+; GFX12-NEXT: v_dual_mov_b32 v15, s10 :: v_dual_mov_b32 v16, s11
+; GFX12-NEXT: v_dual_mov_b32 v17, s8 :: v_dual_mov_b32 v18, s9
+; GFX12-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v20, s7
+; GFX12-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
+; GFX12-NEXT: v_dual_mov_b32 v23, s4 :: v_dual_mov_b32 v24, s5
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[16:17] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[16:17] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i16>, ptr addrspace(4) %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index ffc2cd23ec251f..16f95409055b19 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -5,6 +5,7 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-HSA %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-HSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-NOHSA-LABEL: constant_load_i32:
@@ -69,6 +70,18 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2
; GFX9-HSA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load i32, ptr addrspace(4) %in
store i32 %ld, ptr addrspace(1) %out
@@ -142,6 +155,19 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s3
; GFX9-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v2i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <2 x i32>, ptr addrspace(4) %in
store <2 x i32> %ld, ptr addrspace(1) %out
@@ -226,6 +252,19 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX9-HSA-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v3i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i32>, ptr addrspace(4) %in
store <3 x i32> %ld, ptr addrspace(1) %out
@@ -307,6 +346,20 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s7
; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v4i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <4 x i32>, ptr addrspace(4) %in
store <4 x i32> %ld, ptr addrspace(1) %out
@@ -421,6 +474,24 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] offset:16
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v8i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <8 x i32>, ptr addrspace(4) %in
store <8 x i32> %ld, ptr addrspace(1) %out
@@ -562,6 +633,27 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s3
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v9i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x20
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, s12
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: global_store_b32 v8, v9, s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <9 x i32>, ptr addrspace(4) %in
store <9 x i32> %ld, ptr addrspace(1) %out
@@ -708,6 +800,28 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s3
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v10i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v10, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v10, v[4:7], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <10 x i32>, ptr addrspace(4) %in
store <10 x i32> %ld, ptr addrspace(1) %out
@@ -865,6 +979,28 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX9-HSA-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9]
; GFX9-HSA-NEXT: global_store_dwordx3 v7, v[4:6], s[8:9] offset:32
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v11i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v8, s12
+; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v11, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v11, v[4:7], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <11 x i32>, ptr addrspace(4) %in
store <11 x i32> %ld, ptr addrspace(1) %out
@@ -1019,6 +1155,29 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX9-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[8:9] offset:16
; GFX9-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[8:9]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v12i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v12, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
+; GFX12-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14
+; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
+; GFX12-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
+; GFX12-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX12-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v12, v[8:11], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <12 x i32>, ptr addrspace(4) %in
store <12 x i32> %ld, ptr addrspace(1) %out
@@ -1203,6 +1362,30 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v16i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
+; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
+; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s5
+; GFX12-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s3
+; GFX12-NEXT: v_mov_b32_e32 v14, s2
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <16 x i32>, ptr addrspace(4) %in
store <16 x i32> %ld, ptr addrspace(1) %out
@@ -1276,6 +1459,18 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX9-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_zextload_i32_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load i32, ptr addrspace(4) %in
%ext = zext i32 %ld to i64
store i64 %ext, ptr addrspace(1) %out
@@ -1354,6 +1549,21 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s3
; GFX9-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_sextload_i32_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load i32, ptr addrspace(4) %in
%ext = sext i32 %ld to i64
store i64 %ext, ptr addrspace(1) %out
@@ -1427,6 +1637,18 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX9-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_zextload_v1i32_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <1 x i32>, ptr addrspace(4) %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -1505,6 +1727,21 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s3
; GFX9-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_sextload_v1i32_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <1 x i32>, ptr addrspace(4) %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -1589,6 +1826,20 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_zextload_v2i32_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <2 x i32>, ptr addrspace(4) %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -1686,6 +1937,23 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s4
; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_sextload_v2i32_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s4, s3, 31
+; GFX12-NEXT: s_ashr_i32 s5, s2, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <2 x i32>, ptr addrspace(4) %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -1799,6 +2067,23 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s5
; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_zextload_v4i32_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <4 x i32>, ptr addrspace(4) %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -1938,6 +2223,28 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_sextload_v4i32_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s8, s7, 31
+; GFX12-NEXT: s_ashr_i32 s9, s6, 31
+; GFX12-NEXT: s_ashr_i32 s2, s5, 31
+; GFX12-NEXT: s_ashr_i32 s3, s4, 31
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <4 x i32>, ptr addrspace(4) %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -2114,6 +2421,29 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1
; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[8:9]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_zextload_v8i32_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <8 x i32>, ptr addrspace(4) %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -2350,6 +2680,38 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s2
; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_sextload_v8i32_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s16, s11, 31
+; GFX12-NEXT: s_ashr_i32 s17, s10, 31
+; GFX12-NEXT: s_ashr_i32 s14, s9, 31
+; GFX12-NEXT: s_ashr_i32 s15, s8, 31
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
+; GFX12-NEXT: s_ashr_i32 s12, s7, 31
+; GFX12-NEXT: s_ashr_i32 s13, s6, 31
+; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
+; GFX12-NEXT: s_ashr_i32 s2, s5, 31
+; GFX12-NEXT: s_ashr_i32 s3, s4, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
+; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <8 x i32>, ptr addrspace(4) %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -2777,6 +3139,59 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s18
; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_sextload_v16i32_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s28, s11, 31
+; GFX12-NEXT: s_ashr_i32 s29, s10, 31
+; GFX12-NEXT: s_ashr_i32 s33, s15, 31
+; GFX12-NEXT: s_ashr_i32 s34, s14, 31
+; GFX12-NEXT: s_ashr_i32 s26, s9, 31
+; GFX12-NEXT: s_ashr_i32 s27, s8, 31
+; GFX12-NEXT: s_ashr_i32 s30, s13, 31
+; GFX12-NEXT: s_ashr_i32 s31, s12, 31
+; GFX12-NEXT: v_dual_mov_b32 v28, 0 :: v_dual_mov_b32 v1, s34
+; GFX12-NEXT: s_ashr_i32 s24, s7, 31
+; GFX12-NEXT: s_ashr_i32 s25, s6, 31
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s33
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
+; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s27
+; GFX12-NEXT: s_ashr_i32 s22, s5, 31
+; GFX12-NEXT: s_ashr_i32 s23, s4, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30
+; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s26
+; GFX12-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v17, s25
+; GFX12-NEXT: s_ashr_i32 s20, s3, 31
+; GFX12-NEXT: s_ashr_i32 s21, s2, 31
+; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s24
+; GFX12-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v21, s23
+; GFX12-NEXT: s_ashr_i32 s18, s1, 31
+; GFX12-NEXT: s_ashr_i32 s19, s0, 31
+; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s22
+; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v25, s21
+; GFX12-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v27, s20
+; GFX12-NEXT: v_mov_b32_e32 v26, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v28, v[0:3], s[16:17] offset:112
+; GFX12-NEXT: global_store_b128 v28, v[4:7], s[16:17] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v28, v[8:11], s[16:17] offset:80
+; GFX12-NEXT: global_store_b128 v28, v[12:15], s[16:17] offset:64
+; GFX12-NEXT: global_store_b128 v28, v[16:19], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v28, v[20:23], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v28, v[24:27], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v28, v[0:3], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(4) %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -3085,6 +3500,41 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1
; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_zextload_v16i32_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s14
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s15
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s12
+; GFX12-NEXT: v_mov_b32_e32 v2, s13
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s10
+; GFX12-NEXT: v_mov_b32_e32 v2, s11
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: v_mov_b32_e32 v2, s9
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s7
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(4) %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -3902,6 +4352,104 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s33
; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_sextload_v32i32_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_ashr_i32 s49, s15, 31
+; GFX12-NEXT: s_ashr_i32 s64, s31, 31
+; GFX12-NEXT: s_ashr_i32 s65, s30, 31
+; GFX12-NEXT: s_ashr_i32 s62, s29, 31
+; GFX12-NEXT: s_ashr_i32 s63, s28, 31
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s65
+; GFX12-NEXT: s_ashr_i32 s60, s27, 31
+; GFX12-NEXT: s_ashr_i32 s61, s26, 31
+; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s64
+; GFX12-NEXT: v_dual_mov_b32 v2, s31 :: v_dual_mov_b32 v5, s63
+; GFX12-NEXT: s_ashr_i32 s58, s25, 31
+; GFX12-NEXT: s_ashr_i32 s59, s24, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s28 :: v_dual_mov_b32 v7, s62
+; GFX12-NEXT: v_dual_mov_b32 v6, s29 :: v_dual_mov_b32 v9, s61
+; GFX12-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v11, s60
+; GFX12-NEXT: v_dual_mov_b32 v10, s27 :: v_dual_mov_b32 v13, s59
+; GFX12-NEXT: s_ashr_i32 s57, s23, 31
+; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s58
+; GFX12-NEXT: s_ashr_i32 s24, s22, 31
+; GFX12-NEXT: v_mov_b32_e32 v14, s25
+; GFX12-NEXT: s_ashr_i32 s55, s21, 31
+; GFX12-NEXT: s_ashr_i32 s56, s20, 31
+; GFX12-NEXT: s_ashr_i32 s53, s19, 31
+; GFX12-NEXT: s_ashr_i32 s54, s18, 31
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s22
+; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s23
+; GFX12-NEXT: v_mov_b32_e32 v5, s56
+; GFX12-NEXT: s_ashr_i32 s51, s17, 31
+; GFX12-NEXT: s_ashr_i32 s52, s16, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s55
+; GFX12-NEXT: v_dual_mov_b32 v6, s21 :: v_dual_mov_b32 v9, s54
+; GFX12-NEXT: s_ashr_i32 s50, s14, 31
+; GFX12-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v11, s53
+; GFX12-NEXT: v_dual_mov_b32 v10, s19 :: v_dual_mov_b32 v13, s52
+; GFX12-NEXT: s_ashr_i32 s45, s11, 31
+; GFX12-NEXT: s_ashr_i32 s46, s10, 31
+; GFX12-NEXT: s_ashr_i32 s47, s13, 31
+; GFX12-NEXT: s_ashr_i32 s48, s12, 31
+; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s51
+; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v17, s50
+; GFX12-NEXT: s_ashr_i32 s43, s9, 31
+; GFX12-NEXT: s_ashr_i32 s44, s8, 31
+; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s49
+; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v21, s48
+; GFX12-NEXT: s_ashr_i32 s41, s7, 31
+; GFX12-NEXT: s_ashr_i32 s42, s6, 31
+; GFX12-NEXT: v_dual_mov_b32 v20, s12 :: v_dual_mov_b32 v23, s47
+; GFX12-NEXT: v_mov_b32_e32 v22, s13
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s10
+; GFX12-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v2, s11
+; GFX12-NEXT: v_mov_b32_e32 v5, s44
+; GFX12-NEXT: s_ashr_i32 s39, s5, 31
+; GFX12-NEXT: s_ashr_i32 s40, s4, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s43
+; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s42
+; GFX12-NEXT: s_ashr_i32 s35, s3, 31
+; GFX12-NEXT: s_ashr_i32 s38, s2, 31
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s41
+; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s40
+; GFX12-NEXT: s_ashr_i32 s33, s1, 31
+; GFX12-NEXT: s_ashr_i32 s34, s0, 31
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s39
+; GFX12-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v17, s38
+; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s35
+; GFX12-NEXT: v_dual_mov_b32 v18, s3 :: v_dual_mov_b32 v21, s34
+; GFX12-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s33
+; GFX12-NEXT: v_mov_b32_e32 v22, s1
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -4480,6 +5028,67 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1
; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[36:37]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_zextload_v32i32_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, s31
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:240
+; GFX12-NEXT: v_mov_b32_e32 v0, s28
+; GFX12-NEXT: v_mov_b32_e32 v2, s29
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:224
+; GFX12-NEXT: v_mov_b32_e32 v0, s26
+; GFX12-NEXT: v_mov_b32_e32 v2, s27
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:208
+; GFX12-NEXT: v_mov_b32_e32 v0, s24
+; GFX12-NEXT: v_mov_b32_e32 v2, s25
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:192
+; GFX12-NEXT: v_mov_b32_e32 v0, s22
+; GFX12-NEXT: v_mov_b32_e32 v2, s23
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:176
+; GFX12-NEXT: v_mov_b32_e32 v0, s20
+; GFX12-NEXT: v_mov_b32_e32 v2, s21
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v0, s18
+; GFX12-NEXT: v_mov_b32_e32 v2, s19
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s16
+; GFX12-NEXT: v_mov_b32_e32 v2, s17
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:128
+; GFX12-NEXT: v_mov_b32_e32 v0, s14
+; GFX12-NEXT: v_mov_b32_e32 v2, s15
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s12
+; GFX12-NEXT: v_mov_b32_e32 v2, s13
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s10
+; GFX12-NEXT: v_mov_b32_e32 v2, s11
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: v_mov_b32_e32 v2, s9
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s7
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -4814,6 +5423,44 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3
; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37]
; GFX9-HSA-NEXT: s_endpgm
+;
+; GFX12-LABEL: constant_load_v32i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29
+; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31
+; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25
+; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s27
+; GFX12-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v9, s21
+; GFX12-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s17
+; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19
+; GFX12-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v17, s13
+; GFX12-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s15
+; GFX12-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s9
+; GFX12-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v23, s11
+; GFX12-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v25, s5
+; GFX12-NEXT: v_dual_mov_b32 v24, s4 :: v_dual_mov_b32 v27, s7
+; GFX12-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v29, s1
+; GFX12-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v31, s3
+; GFX12-NEXT: v_mov_b32_e32 v30, s2
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
store <32 x i32> %ld, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 17061e41b5bb67..9b3830671acbd6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-LABEL: constant_load_i64:
@@ -59,6 +60,19 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load i64, ptr addrspace(4) %in
store i64 %ld, ptr addrspace(1) %out
ret void
@@ -125,6 +139,20 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v2i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <2 x i64>, ptr addrspace(4) %in
store <2 x i64> %ld, ptr addrspace(1) %out
@@ -222,6 +250,25 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v3i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i64>, ptr addrspace(4) %in
store <3 x i64> %ld, ptr addrspace(1) %out
@@ -322,6 +369,24 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 17:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v4i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <4 x i64>, ptr addrspace(4) %in
store <4 x i64> %ld, ptr addrspace(1) %out
@@ -490,6 +555,30 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 35:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v8i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
+; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
+; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s5
+; GFX12-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s3
+; GFX12-NEXT: v_mov_b32_e32 v14, s2
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <8 x i64>, ptr addrspace(4) %in
store <8 x i64> %ld, ptr addrspace(1) %out
@@ -807,6 +896,44 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; EG-NEXT: ALU clause starting at 71:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v16i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29
+; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31
+; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25
+; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s27
+; GFX12-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v9, s21
+; GFX12-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s17
+; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19
+; GFX12-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v17, s13
+; GFX12-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s15
+; GFX12-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s9
+; GFX12-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v23, s11
+; GFX12-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v25, s5
+; GFX12-NEXT: v_dual_mov_b32 v24, s4 :: v_dual_mov_b32 v27, s7
+; GFX12-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v29, s1
+; GFX12-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v31, s3
+; GFX12-NEXT: v_mov_b32_e32 v30, s2
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <16 x i64>, ptr addrspace(4) %in
store <16 x i64> %ld, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 9ebd2018798253..f18a34515a8265 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; TODO: NOT AND
define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
@@ -74,6 +75,18 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load i8, ptr addrspace(4) %in
store i8 %ld, ptr addrspace(1) %out
@@ -149,6 +162,18 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v2i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <2 x i8>, ptr addrspace(4) %in
store <2 x i8> %ld, ptr addrspace(1) %out
@@ -250,6 +275,20 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v3i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[0:1] offset:2
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i8>, ptr addrspace(4) %in
store <3 x i8> %ld, ptr addrspace(1) %out
@@ -308,6 +347,18 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v4i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <4 x i8>, ptr addrspace(4) %in
store <4 x i8> %ld, ptr addrspace(1) %out
@@ -369,6 +420,19 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v8i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <8 x i8>, ptr addrspace(4) %in
store <8 x i8> %ld, ptr addrspace(1) %out
@@ -436,6 +500,20 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_load_v16i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <16 x i8>, ptr addrspace(4) %in
store <16 x i8> %ld, ptr addrspace(1) %out
@@ -501,6 +579,18 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_i8_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %in
%ext = zext i8 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -567,6 +657,18 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_sextload_i8_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%ld = load i8, ptr addrspace(4) %in
%ext = sext i8 %ld to i32
store i32 %ext, ptr addrspace(1) %out
@@ -632,6 +734,18 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v1i8_to_v1i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i8>, ptr addrspace(4) %in
%ext = zext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -698,6 +812,18 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_sextload_v1i8_to_v1i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i8>, ptr addrspace(4) %in
%ext = sext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -782,6 +908,22 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
; EG-NEXT: AND_INT T4.X, T0.W, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_zextload_v2i8_to_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = zext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -866,6 +1008,22 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
; EG-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v2i8_to_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = sext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -951,6 +1109,23 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v3i8_to_v3i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: s_and_b32 s3, s2, 0xff
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i8>, ptr addrspace(4) %in
%ext = zext <3 x i8> %ld to <3 x i32>
@@ -1038,6 +1213,24 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; EG-NEXT: LSHR T4.X, PS, literal.x,
; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+;
+; GFX12-LABEL: constant_sextload_v3i8_to_v3i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: s_sext_i32_i8 s3, s2
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <3 x i8>, ptr addrspace(4) %in
%ext = sext <3 x i8> %ld to <3 x i32>
@@ -1126,6 +1319,25 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_zextload_v4i8_to_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: s_and_b32 s4, s2, 0xff
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(4) %in
%ext = zext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1216,6 +1428,25 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+;
+; GFX12-LABEL: constant_sextload_v4i8_to_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: s_ashr_i32 s3, s2, 24
+; GFX12-NEXT: s_sext_i32_i8 s4, s2
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(4) %in
%ext = sext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1349,6 +1580,32 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v8i8_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT: s_lshr_b32 s5, s2, 24
+; GFX12-NEXT: s_and_b32 s7, s2, 0xff
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: s_and_b32 s6, s3, 0xff
+; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i8>, ptr addrspace(4) %in
%ext = zext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1487,6 +1744,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; EG-NEXT: LSHR T8.X, PS, literal.x,
; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+;
+; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT: s_ashr_i32 s6, s2, 24
+; GFX12-NEXT: s_sext_i32_i8 s7, s2
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX12-NEXT: s_ashr_i32 s4, s3, 24
+; GFX12-NEXT: s_bfe_i32 s5, s3, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s3, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v4, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i8>, ptr addrspace(4) %in
%ext = sext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1711,6 +1996,46 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v16i8_to_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
+; GFX12-NEXT: s_lshr_b32 s8, s6, 24
+; GFX12-NEXT: s_lshr_b32 s9, s7, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
+; GFX12-NEXT: s_and_b32 s12, s6, 0xff
+; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT: s_and_b32 s13, s7, 0xff
+; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX12-NEXT: s_and_b32 s11, s5, 0xff
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s9
+; GFX12-NEXT: s_lshr_b32 s3, s5, 24
+; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3
+; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: s_lshr_b32 s2, s4, 24
+; GFX12-NEXT: s_and_b32 s10, s4, 0xff
+; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5
+; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_and_b32 v13, 0xffff, v13
+; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i8>, ptr addrspace(4) %in
%ext = zext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -1947,6 +2272,50 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; EG-NEXT: LSHR T14.X, PS, literal.x,
; EG-NEXT: BFE_INT * T12.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+;
+; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
+; GFX12-NEXT: s_ashr_i32 s12, s7, 24
+; GFX12-NEXT: s_sext_i32_i8 s13, s7
+; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80010
+; GFX12-NEXT: s_ashr_i32 s10, s6, 24
+; GFX12-NEXT: s_bfe_i32 s11, s6, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s6, s6
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s12
+; GFX12-NEXT: s_ashr_i32 s8, s5, 24
+; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: s_ashr_i32 s2, s4, 24
+; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s4, s4
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_mov_b32_e32 v6, s11
+; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_mov_b32_e32 v10, s9
+; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v12, s4
+; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i8>, ptr addrspace(4) %in
%ext = sext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2352,6 +2721,77 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; EG-NEXT: LSHR * T26.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v32i8_to_v32i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
+; GFX12-NEXT: s_lshr_b32 s15, s9, 24
+; GFX12-NEXT: s_lshr_b32 s17, s11, 24
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
+; GFX12-NEXT: s_and_b32 s23, s9, 0xff
+; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
+; GFX12-NEXT: s_and_b32 s25, s11, 0xff
+; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
+; GFX12-NEXT: s_lshr_b32 s14, s8, 24
+; GFX12-NEXT: s_lshr_b32 s16, s10, 24
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
+; GFX12-NEXT: s_and_b32 s22, s8, 0xff
+; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
+; GFX12-NEXT: s_and_b32 s24, s10, 0xff
+; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s17
+; GFX12-NEXT: s_lshr_b32 s13, s7, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
+; GFX12-NEXT: s_and_b32 s21, s7, 0xff
+; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13
+; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_and_b32 v25, 0xffff, v11
+; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_and_b32 v29, 0xffff, v10
+; GFX12-NEXT: v_dual_mov_b32 v24, s21 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v11, s15
+; GFX12-NEXT: v_mov_b32_e32 v26, s7
+; GFX12-NEXT: s_lshr_b32 s12, s6, 24
+; GFX12-NEXT: s_and_b32 s20, s6, 0xff
+; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14
+; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12
+; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20
+; GFX12-NEXT: s_lshr_b32 s3, s5, 24
+; GFX12-NEXT: s_and_b32 s19, s5, 0xff
+; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v27, s13 :: v_dual_mov_b32 v22, s6
+; GFX12-NEXT: s_lshr_b32 s2, s4, 24
+; GFX12-NEXT: s_and_b32 s18, s4, 0xff
+; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19
+; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i8>, ptr addrspace(4) %in
%ext = zext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -2788,6 +3228,84 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; EG-NEXT: LSHR T26.X, PS, literal.x,
; EG-NEXT: BFE_INT * T24.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+;
+; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
+; GFX12-NEXT: s_ashr_i32 s20, s9, 24
+; GFX12-NEXT: s_bfe_i32 s21, s9, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s9, s9
+; GFX12-NEXT: s_ashr_i32 s24, s11, 24
+; GFX12-NEXT: s_sext_i32_i8 s25, s11
+; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
+; GFX12-NEXT: s_ashr_i32 s18, s8, 24
+; GFX12-NEXT: s_bfe_i32 s19, s8, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s8, s8
+; GFX12-NEXT: s_ashr_i32 s22, s10, 24
+; GFX12-NEXT: s_bfe_i32 s23, s10, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
+; GFX12-NEXT: s_ashr_i32 s12, s5, 24
+; GFX12-NEXT: s_ashr_i32 s14, s6, 24
+; GFX12-NEXT: s_ashr_i32 s16, s7, 24
+; GFX12-NEXT: s_bfe_i32 s17, s7, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s7, s7
+; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22
+; GFX12-NEXT: v_mov_b32_e32 v2, s11
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8
+; GFX12-NEXT: v_bfe_i32 v29, v10, 0, 8
+; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s12
+; GFX12-NEXT: v_mov_b32_e32 v11, s20
+; GFX12-NEXT: s_ashr_i32 s2, s4, 24
+; GFX12-NEXT: s_bfe_i32 s15, s6, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s6, s6
+; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18
+; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16
+; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_mov_b32_e32 v30, s19
+; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s5, s5
+; GFX12-NEXT: v_mov_b32_e32 v24, s7
+; GFX12-NEXT: v_mov_b32_e32 v26, s17
+; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s4, s4
+; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v20, s6
+; GFX12-NEXT: v_mov_b32_e32 v22, s15
+; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v16, s5
+; GFX12-NEXT: v_mov_b32_e32 v18, s13
+; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v12, s4
+; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i8>, ptr addrspace(4) %in
%ext = sext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3565,6 +4083,140 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
; EG-NEXT: LSHR * T50.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v64i8_to_v64i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_lshr_b32 s34, s15, 24
+; GFX12-NEXT: s_and_b32 s50, s15, 0xff
+; GFX12-NEXT: s_bfe_u32 s15, s15, 0x80010
+; GFX12-NEXT: s_lshr_b32 s27, s9, 24
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s13
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s9
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v15, 8, s4
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: v_dual_mov_b32 v52, 0 :: v_dual_and_b32 v5, 0xffff, v5
+; GFX12-NEXT: v_dual_mov_b32 v48, s50 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: v_mov_b32_e32 v50, s15
+; GFX12-NEXT: s_and_b32 s44, s9, 0xff
+; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v49, 0xffff, v0
+; GFX12-NEXT: v_mov_b32_e32 v51, s34
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s12
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
+; GFX12-NEXT: s_lshr_b32 s31, s13, 24
+; GFX12-NEXT: s_lshr_b32 s33, s14, 24
+; GFX12-NEXT: s_and_b32 s43, s8, 0xff
+; GFX12-NEXT: s_and_b32 s48, s13, 0xff
+; GFX12-NEXT: s_and_b32 s49, s14, 0xff
+; GFX12-NEXT: s_bfe_u32 s14, s14, 0x80010
+; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX12-NEXT: s_lshr_b32 s30, s12, 24
+; GFX12-NEXT: s_and_b32 s47, s12, 0xff
+; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX12-NEXT: s_lshr_b32 s26, s8, 24
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: v_dual_mov_b32 v44, s49 :: v_dual_and_b32 v13, 0xffff, v13
+; GFX12-NEXT: v_dual_mov_b32 v46, s14 :: v_dual_and_b32 v17, 0xffff, v15
+; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v14
+; GFX12-NEXT: v_and_b32_e32 v23, 0xffff, v12
+; GFX12-NEXT: s_and_b32 s42, s7, 0xff
+; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v45, 0xffff, v2
+; GFX12-NEXT: v_dual_mov_b32 v47, s33 :: v_dual_mov_b32 v30, s43
+; GFX12-NEXT: global_store_b128 v52, v[48:51], s[16:17] offset:240
+; GFX12-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_and_b32 v49, 0xffff, v3
+; GFX12-NEXT: v_mov_b32_e32 v48, s48
+; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s31
+; GFX12-NEXT: v_mov_b32_e32 v26, s42
+; GFX12-NEXT: s_lshr_b32 s25, s7, 24
+; GFX12-NEXT: s_lshr_b32 s28, s10, 24
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, s10
+; GFX12-NEXT: v_dual_mov_b32 v40, s47 :: v_dual_and_b32 v27, 0xffff, v11
+; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v42, s12 :: v_dual_and_b32 v31, 0xffff, v10
+; GFX12-NEXT: s_and_b32 s45, s10, 0xff
+; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
+; GFX12-NEXT: v_and_b32_e32 v41, 0xffff, v4
+; GFX12-NEXT: v_dual_mov_b32 v43, s30 :: v_dual_mov_b32 v28, s7
+; GFX12-NEXT: s_lshr_b32 s29, s11, 24
+; GFX12-NEXT: s_and_b32 s46, s11, 0xff
+; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
+; GFX12-NEXT: s_lshr_b32 s24, s6, 24
+; GFX12-NEXT: s_and_b32 s41, s6, 0xff
+; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v8
+; GFX12-NEXT: global_store_b128 v52, v[44:47], s[16:17] offset:224
+; GFX12-NEXT: v_mov_b32_e32 v46, s29
+; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v6
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v52, v[48:51], s[16:17] offset:208
+; GFX12-NEXT: global_store_b128 v52, v[40:43], s[16:17] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v43, s46 :: v_dual_mov_b32 v22, s41
+; GFX12-NEXT: v_dual_mov_b32 v45, s11 :: v_dual_mov_b32 v24, s6
+; GFX12-NEXT: s_and_b32 s40, s5, 0xff
+; GFX12-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_and_b32 v39, 0xffff, v7
+; GFX12-NEXT: v_dual_mov_b32 v40, s10 :: v_dual_mov_b32 v41, s28
+; GFX12-NEXT: v_mov_b32_e32 v20, s40
+; GFX12-NEXT: s_lshr_b32 s23, s5, 24
+; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX12-NEXT: v_mov_b32_e32 v37, s27
+; GFX12-NEXT: s_lshr_b32 s22, s4, 24
+; GFX12-NEXT: s_and_b32 s38, s3, 0xff
+; GFX12-NEXT: s_and_b32 s39, s4, 0xff
+; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39
+; GFX12-NEXT: v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4
+; GFX12-NEXT: s_lshr_b32 s21, s3, 24
+; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v12, s38
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v52, v[43:46], s[16:17] offset:176
+; GFX12-NEXT: global_store_b128 v52, v[38:41], s[16:17] offset:160
+; GFX12-NEXT: global_store_b128 v52, v[34:37], s[16:17] offset:144
+; GFX12-NEXT: global_store_b128 v52, v[30:33], s[16:17] offset:128
+; GFX12-NEXT: global_store_b128 v52, v[26:29], s[16:17] offset:112
+; GFX12-NEXT: global_store_b128 v52, v[22:25], s[16:17] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23
+; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: s_lshr_b32 s20, s2, 24
+; GFX12-NEXT: s_and_b32 s37, s2, 0xff
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37
+; GFX12-NEXT: s_lshr_b32 s19, s1, 24
+; GFX12-NEXT: s_and_b32 s36, s1, 0xff
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v15, s21 :: v_dual_mov_b32 v10, s2
+; GFX12-NEXT: s_lshr_b32 s18, s0, 24
+; GFX12-NEXT: s_and_b32 s35, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36
+; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19
+; GFX12-NEXT: v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v52, v[20:23], s[16:17] offset:80
+; GFX12-NEXT: global_store_b128 v52, v[16:19], s[16:17] offset:64
+; GFX12-NEXT: global_store_b128 v52, v[12:15], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v52, v[8:11], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v52, v[4:7], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v52, v[0:3], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i8>, ptr addrspace(4) %in
%ext = zext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4396,6 +5048,151 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; EG-NEXT: LSHR T50.X, PS, literal.x,
; EG-NEXT: BFE_INT * T48.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+;
+; GFX12-LABEL: constant_sextload_v64i8_to_v64i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15
+; GFX12-NEXT: s_ashr_i32 s49, s15, 24
+; GFX12-NEXT: s_bfe_i32 s50, s15, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s15, s15
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s13
+; GFX12-NEXT: v_dual_mov_b32 v54, 0 :: v_dual_mov_b32 v45, s49
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s12
+; GFX12-NEXT: s_ashr_i32 s45, s13, 24
+; GFX12-NEXT: s_ashr_i32 s47, s14, 24
+; GFX12-NEXT: v_bfe_i32 v43, v0, 0, 8
+; GFX12-NEXT: v_dual_mov_b32 v42, s15 :: v_dual_mov_b32 v49, s47
+; GFX12-NEXT: v_dual_mov_b32 v44, s50 :: v_dual_mov_b32 v53, s45
+; GFX12-NEXT: s_bfe_i32 s46, s13, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s13, s13
+; GFX12-NEXT: s_bfe_i32 s48, s14, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s14, s14
+; GFX12-NEXT: s_ashr_i32 s43, s12, 24
+; GFX12-NEXT: s_bfe_i32 s44, s12, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s12, s12
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11
+; GFX12-NEXT: s_ashr_i32 s39, s10, 24
+; GFX12-NEXT: s_ashr_i32 s41, s11, 24
+; GFX12-NEXT: s_bfe_i32 s42, s11, 0x80010
+; GFX12-NEXT: v_bfe_i32 v47, v2, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v46, s14
+; GFX12-NEXT: v_bfe_i32 v51, v3, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v50, s13
+; GFX12-NEXT: v_mov_b32_e32 v52, s46
+; GFX12-NEXT: s_sext_i32_i8 s11, s11
+; GFX12-NEXT: v_mov_b32_e32 v48, s48
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, s10
+; GFX12-NEXT: s_ashr_i32 s35, s8, 24
+; GFX12-NEXT: s_ashr_i32 s37, s9, 24
+; GFX12-NEXT: v_bfe_i32 v41, v4, 0, 8
+; GFX12-NEXT: global_store_b128 v54, v[42:45], s[16:17] offset:240
+; GFX12-NEXT: v_dual_mov_b32 v40, s12 :: v_dual_mov_b32 v37, s37
+; GFX12-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v33, s35
+; GFX12-NEXT: v_mov_b32_e32 v43, s43
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s9
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
+; GFX12-NEXT: s_bfe_i32 s40, s10, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v15, 8, s4
+; GFX12-NEXT: s_ashr_i32 s33, s7, 24
+; GFX12-NEXT: s_bfe_i32 s38, s9, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s9, s9
+; GFX12-NEXT: global_store_b128 v54, v[46:49], s[16:17] offset:224
+; GFX12-NEXT: v_bfe_i32 v44, v6, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v45, s42
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v54, v[50:53], s[16:17] offset:208
+; GFX12-NEXT: global_store_b128 v54, v[40:43], s[16:17] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v43, s11 :: v_dual_mov_b32 v46, s41
+; GFX12-NEXT: v_mov_b32_e32 v29, s33
+; GFX12-NEXT: s_ashr_i32 s28, s5, 24
+; GFX12-NEXT: s_ashr_i32 s30, s6, 24
+; GFX12-NEXT: s_bfe_i32 s36, s8, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s8, s8
+; GFX12-NEXT: v_bfe_i32 v39, v7, 0, 8
+; GFX12-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v25, s30
+; GFX12-NEXT: v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v41, s39
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
+; GFX12-NEXT: s_ashr_i32 s24, s3, 24
+; GFX12-NEXT: s_ashr_i32 s26, s4, 24
+; GFX12-NEXT: s_bfe_i32 s31, s6, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s6, s6
+; GFX12-NEXT: s_bfe_i32 s34, s7, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s7, s7
+; GFX12-NEXT: v_bfe_i32 v35, v8, 0, 8
+; GFX12-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v19, s26
+; GFX12-NEXT: v_mov_b32_e32 v36, s38
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2
+; GFX12-NEXT: s_ashr_i32 s18, s0, 24
+; GFX12-NEXT: s_ashr_i32 s20, s1, 24
+; GFX12-NEXT: s_ashr_i32 s22, s2, 24
+; GFX12-NEXT: s_bfe_i32 s29, s5, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s5, s5
+; GFX12-NEXT: v_bfe_i32 v31, v10, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v30, s8
+; GFX12-NEXT: v_dual_mov_b32 v32, s36 :: v_dual_mov_b32 v7, s20
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_bfe_i32 s27, s4, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s4, s4
+; GFX12-NEXT: v_bfe_i32 v23, v12, 0, 8
+; GFX12-NEXT: v_bfe_i32 v27, v11, 0, 8
+; GFX12-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v3, s18
+; GFX12-NEXT: v_mov_b32_e32 v28, s34
+; GFX12-NEXT: v_mov_b32_e32 v22, s6
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_bfe_i32 s25, s3, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s3, s3
+; GFX12-NEXT: v_bfe_i32 v17, v15, 0, 8
+; GFX12-NEXT: v_bfe_i32 v21, v14, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v24, s31
+; GFX12-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v15, s24
+; GFX12-NEXT: v_mov_b32_e32 v11, s22
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v54, v[43:46], s[16:17] offset:176
+; GFX12-NEXT: global_store_b128 v54, v[38:41], s[16:17] offset:160
+; GFX12-NEXT: global_store_b128 v54, v[34:37], s[16:17] offset:144
+; GFX12-NEXT: global_store_b128 v54, v[30:33], s[16:17] offset:128
+; GFX12-NEXT: global_store_b128 v54, v[26:29], s[16:17] offset:112
+; GFX12-NEXT: global_store_b128 v54, v[22:25], s[16:17] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28
+; GFX12-NEXT: s_bfe_i32 s23, s2, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s2, s2
+; GFX12-NEXT: v_mov_b32_e32 v16, s4
+; GFX12-NEXT: v_mov_b32_e32 v18, s27
+; GFX12-NEXT: s_bfe_i32 s21, s1, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s1, s1
+; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v12, s3
+; GFX12-NEXT: v_mov_b32_e32 v14, s25
+; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s0, s0
+; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v8, s2
+; GFX12-NEXT: v_mov_b32_e32 v10, s23
+; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v4, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s21
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s19
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v54, v[20:23], s[16:17] offset:80
+; GFX12-NEXT: global_store_b128 v54, v[16:19], s[16:17] offset:64
+; GFX12-NEXT: global_store_b128 v54, v[12:15], s[16:17] offset:48
+; GFX12-NEXT: global_store_b128 v54, v[8:11], s[16:17] offset:32
+; GFX12-NEXT: global_store_b128 v54, v[4:7], s[16:17] offset:16
+; GFX12-NEXT: global_store_b128 v54, v[0:3], s[16:17]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <64 x i8>, ptr addrspace(4) %in
%ext = sext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4466,6 +5263,19 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_i8_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %in
%ext = zext i8 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4539,6 +5349,21 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_i8_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %in
%ext = sext i8 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4608,6 +5433,18 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v1i8_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i8>, ptr addrspace(4) %in
%ext = zext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4681,6 +5518,21 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v1i8_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i8>, ptr addrspace(4) %in
%ext = sext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4772,6 +5624,22 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
; EG-NEXT: MOV T4.W, 0.0,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
+;
+; GFX12-LABEL: constant_zextload_v2i8_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = zext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4867,6 +5735,25 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; EG-NEXT: ASHR * T4.W, PV.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v2i8_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v0, v4, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = sext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4980,6 +5867,29 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v4i8_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: s_lshr_b32 s4, s2, 24
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(4) %in
%ext = zext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5108,6 +6018,32 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; EG-NEXT: ASHR T4.Y, PV.X, literal.y,
; EG-NEXT: ASHR * T5.W, PV.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v4i8_to_v4i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s6
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(4) %in
%ext = sext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5288,6 +6224,40 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR * T12.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v8i8_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT: s_lshr_b32 s5, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_lshr_b32 s4, s2, 24
+; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: s_and_b32 s2, s3, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i8>, ptr addrspace(4) %in
%ext = zext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5506,6 +6476,46 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; EG-NEXT: ASHR T11.W, PV.Z, literal.y,
; EG-NEXT: ASHR * T7.W, T7.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, s3
+; GFX12-NEXT: s_lshr_b32 s6, s3, 16
+; GFX12-NEXT: s_lshr_b32 s8, s2, 16
+; GFX12-NEXT: s_lshr_b32 s10, s2, 24
+; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 8
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9
+; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
+; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX12-NEXT: v_mov_b32_e32 v12, s4
+; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i8>, ptr addrspace(4) %in
%ext = sext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5820,6 +6830,61 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR * T22.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v16i8_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_lshr_b32 s3, s7, 24
+; GFX12-NEXT: s_lshr_b32 s2, s5, 24
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_lshr_b32 s2, s6, 24
+; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_lshr_b32 s2, s4, 24
+; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_and_b32 s2, s6, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5
+; GFX12-NEXT: s_and_b32 s2, s7, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
+; GFX12-NEXT: s_and_b32 s2, s5, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: s_and_b32 s2, s4, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i8>, ptr addrspace(4) %in
%ext = zext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6214,6 +7279,72 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; EG-NEXT: ASHR T7.W, PV.Z, literal.y,
; EG-NEXT: ASHR * T13.W, T13.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v16i8_to_v16i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v21, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v23, 8, s4
+; GFX12-NEXT: s_lshr_b32 s8, s7, 16
+; GFX12-NEXT: s_lshr_b32 s10, s6, 16
+; GFX12-NEXT: s_lshr_b32 s12, s6, 24
+; GFX12-NEXT: v_bfe_i32 v22, v10, 0, 8
+; GFX12-NEXT: v_bfe_i32 v10, v11, 0, 8
+; GFX12-NEXT: s_lshr_b32 s18, s4, 24
+; GFX12-NEXT: s_mov_b32 s20, s7
+; GFX12-NEXT: s_lshr_b32 s14, s5, 16
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: v_bfe_i32 v28, v21, 0, 8
+; GFX12-NEXT: s_lshr_b32 s16, s4, 16
+; GFX12-NEXT: s_mov_b32 s22, s5
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5
+; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v13, s11
+; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v5, s15
+; GFX12-NEXT: v_bfe_i32 v24, v23, 0, 8
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25
+; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21
+; GFX12-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v27, s23
+; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v17, s17
+; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GFX12-NEXT: v_mov_b32_e32 v26, s22
+; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19
+; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v30, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v30, v[20:23], s[0:1] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v30, v[12:15], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v30, v[8:11], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v30, v[4:7], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v30, v[26:29], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v30, v[16:19], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v30, v[22:25], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i8>, ptr addrspace(4) %in
%ext = sext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6807,6 +7938,105 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: LSHR * T42.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v32i8_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
+; GFX12-NEXT: s_lshr_b32 s11, s7, 24
+; GFX12-NEXT: s_lshr_b32 s10, s5, 24
+; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
+; GFX12-NEXT: s_and_b32 s7, s7, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:240
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshr_b32 s10, s3, 24
+; GFX12-NEXT: s_bfe_u32 s11, s3, 0x80010
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:176
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshr_b32 s10, s1, 24
+; GFX12-NEXT: s_bfe_u32 s11, s1, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshr_b32 s10, s6, 24
+; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80010
+; GFX12-NEXT: s_and_b32 s6, s6, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshr_b32 s10, s4, 24
+; GFX12-NEXT: s_bfe_u32 s11, s4, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:208
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshr_b32 s10, s2, 24
+; GFX12-NEXT: s_bfe_u32 s11, s2, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshr_b32 s10, s0, 24
+; GFX12-NEXT: s_bfe_u32 s11, s0, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5
+; GFX12-NEXT: s_and_b32 s5, s5, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:224
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
+; GFX12-NEXT: s_and_b32 s4, s4, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT: s_and_b32 s3, s3, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
+; GFX12-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:128
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_and_b32 s1, s1, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i8>, ptr addrspace(4) %in
%ext = zext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -7567,6 +8797,124 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; EG-NEXT: ASHR T11.W, PV.Z, literal.y,
; EG-NEXT: ASHR * T26.W, T26.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GFX12-LABEL: constant_sextload_v32i8_to_v32i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
+; GFX12-NEXT: s_lshr_b32 s24, s7, 16
+; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8
+; GFX12-NEXT: s_lshr_b32 s42, s2, 24
+; GFX12-NEXT: s_mov_b32 s48, s7
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
+; GFX12-NEXT: s_lshr_b32 s26, s6, 16
+; GFX12-NEXT: s_lshr_b32 s44, s1, 16
+; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56
+; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24
+; GFX12-NEXT: s_lshr_b32 s28, s6, 24
+; GFX12-NEXT: s_lshr_b32 s30, s5, 16
+; GFX12-NEXT: s_lshr_b32 s40, s2, 16
+; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8
+; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8
+; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8
+; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31
+; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58
+; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26
+; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48
+; GFX12-NEXT: v_mov_b32_e32 v30, s49
+; GFX12-NEXT: s_lshr_b32 s46, s0, 24
+; GFX12-NEXT: s_mov_b32 s50, s5
+; GFX12-NEXT: s_mov_b32 s52, s3
+; GFX12-NEXT: s_lshr_b32 s34, s4, 16
+; GFX12-NEXT: s_lshr_b32 s36, s4, 24
+; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56
+; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56
+; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX12-NEXT: s_lshr_b32 s38, s3, 16
+; GFX12-NEXT: s_mov_b32 s54, s1
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000
+; GFX12-NEXT: s_lshr_b32 s20, s0, 16
+; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56
+; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28
+; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30
+; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56
+; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34
+; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40
+; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000
+; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240
+; GFX12-NEXT: v_mov_b32_e32 v33, s44
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224
+; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17
+; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4
+; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v17, s14
+; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36
+; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38
+; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18
+; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20
+; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2
+; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10
+; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22
+; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6
+; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[8:9] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[8:9] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[8:9] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[8:9] offset:160
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[8:9] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[8:9] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[8:9] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[8:9] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v2, s11
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[8:9] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[8:9] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i8>, ptr addrspace(4) %in
%ext = sext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -7657,6 +9005,18 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_i8_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %in
%ext = zext i8 %a to i16
store i16 %ext, ptr addrspace(1) %out
@@ -7733,6 +9093,18 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_i8_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %in
%ext = sext i8 %a to i16
store i16 %ext, ptr addrspace(1) %out
@@ -7807,6 +9179,18 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v1i8_to_v1i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i8>, ptr addrspace(4) %in
%ext = zext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, ptr addrspace(1) %out
@@ -7883,6 +9267,18 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v1i8_to_v1i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <1 x i8>, ptr addrspace(4) %in
%ext = sext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, ptr addrspace(1) %out
@@ -7965,6 +9361,23 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; EG-NEXT: OR_INT T5.X, PS, PV.W,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_zextload_v2i8_to_v2i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = zext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, ptr addrspace(1) %out
@@ -8061,6 +9474,23 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; EG-NEXT: OR_INT T5.X, PS, PV.W,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX12-LABEL: constant_sextload_v2i8_to_v2i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
+; GFX12-NEXT: v_ashrrev_i16 v1, 8, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = sext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, ptr addrspace(1) %out
@@ -8173,6 +9603,29 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.X, PV.Y,
; EG-NEXT: MOV * T8.X, T4.X,
+;
+; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s3, s2, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
+; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2
+; GFX12-NEXT: s_lshr_b32 s2, s2, 24
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1
+; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(4) %in
%ext = zext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, ptr addrspace(1) %out
@@ -8301,6 +9754,28 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.X, PV.Y,
; EG-NEXT: MOV * T8.X, T4.X,
+;
+; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_i32 s4, s2, 0x80000
+; GFX12-NEXT: s_lshr_b32 s3, s2, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
+; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4
+; GFX12-NEXT: s_ashr_i32 s2, s2, 24
+; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(4) %in
%ext = sext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, ptr addrspace(1) %out
@@ -8470,6 +9945,36 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T12.X, T8.X,
; EG-NEXT: MOV * T12.Z, T4.X,
+;
+; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s5, s2, 16
+; GFX12-NEXT: s_lshr_b32 s6, s3, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
+; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: s_lshr_b32 s4, s2, 24
+; GFX12-NEXT: s_lshr_b32 s2, s3, 24
+; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0
+; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2
+; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3
+; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i8>, ptr addrspace(4) %in
%ext = zext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, ptr addrspace(1) %out
@@ -8671,6 +10176,35 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T12.X, T8.X,
; EG-NEXT: MOV * T12.Z, T4.X,
+;
+; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_i32 s8, s2, 0x80000
+; GFX12-NEXT: s_bfe_i32 s9, s3, 0x80000
+; GFX12-NEXT: s_lshr_b32 s6, s2, 16
+; GFX12-NEXT: s_lshr_b32 s7, s3, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3
+; GFX12-NEXT: s_ashr_i64 s[4:5], s[2:3], 56
+; GFX12-NEXT: v_and_b32_e64 v3, 0xffff, s8
+; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s9
+; GFX12-NEXT: s_ashr_i32 s2, s2, 24
+; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000
+; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000
+; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3
+; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5
+; GFX12-NEXT: v_mov_b32_e32 v3, s3
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <8 x i8>, ptr addrspace(4) %in
%ext = sext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, ptr addrspace(1) %out
@@ -8972,6 +10506,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: MOV * T20.Z, T12.X,
; EG-NEXT: MOV T19.X, T8.X,
; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
+;
+; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s3, s6, 16
+; GFX12-NEXT: s_lshr_b32 s9, s7, 16
+; GFX12-NEXT: s_lshr_b32 s11, s4, 16
+; GFX12-NEXT: s_lshr_b32 s13, s5, 16
+; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s5
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4
+; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7
+; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6
+; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9
+; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13
+; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s11
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s7
+; GFX12-NEXT: v_lshrrev_b16 v0, 8, s4
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s5
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX12-NEXT: s_lshr_b32 s2, s6, 24
+; GFX12-NEXT: s_lshr_b32 s8, s7, 24
+; GFX12-NEXT: s_lshr_b32 s10, s4, 24
+; GFX12-NEXT: s_lshr_b32 s12, s5, 24
+; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v4
+; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5
+; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6
+; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
+; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11
+; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12
+; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9
+; GFX12-NEXT: v_lshl_or_b32 v1, s10, 16, v10
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i8>, ptr addrspace(4) %in
%ext = zext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, ptr addrspace(1) %out
@@ -9334,6 +10917,56 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: MOV * T20.Z, T12.X,
; EG-NEXT: MOV T19.X, T8.X,
; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
+;
+; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s8, s6, 16
+; GFX12-NEXT: v_ashrrev_i16 v5, 8, s6
+; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000
+; GFX12-NEXT: s_lshr_b32 s10, s4, 16
+; GFX12-NEXT: s_lshr_b32 s11, s5, 16
+; GFX12-NEXT: v_ashrrev_i16 v1, 8, s4
+; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5
+; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000
+; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000
+; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
+; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6
+; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000
+; GFX12-NEXT: s_lshr_b32 s9, s7, 16
+; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s4
+; GFX12-NEXT: s_bfe_i32 s3, s11, 0x80000
+; GFX12-NEXT: s_bfe_i32 s4, s10, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s7
+; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5
+; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12
+; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8
+; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6
+; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11
+; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10
+; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3
+; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4
+; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4
+; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
+; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v11
+; GFX12-NEXT: v_lshl_or_b32 v0, v5, 16, v12
+; GFX12-NEXT: v_lshl_or_b32 v1, v13, 16, v16
+; GFX12-NEXT: v_lshl_or_b32 v7, v9, 16, v14
+; GFX12-NEXT: v_lshl_or_b32 v5, v10, 16, v15
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <16 x i8>, ptr addrspace(4) %in
%ext = sext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, ptr addrspace(1) %out
@@ -9895,6 +11528,93 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV * T38.Z, T28.X,
; EG-NEXT: MOV T35.X, T24.X,
; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
+;
+; GFX12-LABEL: constant_zextload_v32i8_to_v32i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s25, s1, 16
+; GFX12-NEXT: s_lshr_b32 s21, s3, 16
+; GFX12-NEXT: s_lshr_b32 s23, s0, 16
+; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s1
+; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s2
+; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s5
+; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s4
+; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s25
+; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s23
+; GFX12-NEXT: v_and_b32_e64 v17, 0xff, s21
+; GFX12-NEXT: s_lshr_b32 s17, s5, 16
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s4
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1
+; GFX12-NEXT: v_and_b32_e64 v19, 0xff, s17
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_and_b32 v7, 0xffff, v7
+; GFX12-NEXT: v_lshrrev_b16 v0, 8, s0
+; GFX12-NEXT: v_and_b32_e32 v20, 0xffff, v15
+; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v17
+; GFX12-NEXT: s_lshr_b32 s11, s6, 16
+; GFX12-NEXT: s_lshr_b32 s13, s7, 16
+; GFX12-NEXT: s_lshr_b32 s24, s1, 24
+; GFX12-NEXT: s_lshr_b32 s15, s4, 16
+; GFX12-NEXT: s_lshr_b32 s20, s3, 24
+; GFX12-NEXT: s_lshr_b32 s19, s2, 16
+; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v6
+; GFX12-NEXT: v_lshl_or_b32 v6, v4, 16, v10
+; GFX12-NEXT: v_lshl_or_b32 v4, v3, 16, v11
+; GFX12-NEXT: v_lshl_or_b32 v3, s24, 16, v14
+; GFX12-NEXT: v_lshl_or_b32 v10, v9, 16, v12
+; GFX12-NEXT: v_lshl_or_b32 v8, v8, 16, v13
+; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s7
+; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s6
+; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s13
+; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s11
+; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v7
+; GFX12-NEXT: v_lshl_or_b32 v7, s20, 16, v15
+; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s15
+; GFX12-NEXT: v_and_b32_e64 v18, 0xff, s19
+; GFX12-NEXT: s_lshr_b32 s16, s5, 24
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
+; GFX12-NEXT: v_lshl_or_b32 v11, s16, 16, v17
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v14
+; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v15
+; GFX12-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX12-NEXT: s_lshr_b32 s10, s6, 24
+; GFX12-NEXT: s_lshr_b32 s12, s7, 24
+; GFX12-NEXT: s_lshr_b32 s14, s4, 24
+; GFX12-NEXT: s_lshr_b32 s18, s2, 24
+; GFX12-NEXT: v_lshl_or_b32 v14, v5, 16, v9
+; GFX12-NEXT: v_lshl_or_b32 v12, v1, 16, v12
+; GFX12-NEXT: v_lshl_or_b32 v15, s12, 16, v13
+; GFX12-NEXT: v_lshl_or_b32 v13, s10, 16, v17
+; GFX12-NEXT: s_lshr_b32 s22, s0, 24
+; GFX12-NEXT: v_lshl_or_b32 v9, s14, 16, v19
+; GFX12-NEXT: v_lshl_or_b32 v5, s18, 16, v18
+; GFX12-NEXT: v_lshl_or_b32 v1, s22, 16, v20
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i8>, ptr addrspace(4) %in
%ext = zext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, ptr addrspace(1) %out
@@ -10582,6 +12302,94 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: MOV * T38.Z, T28.X,
; EG-NEXT: MOV T35.X, T24.X,
; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
+;
+; GFX12-LABEL: constant_sextload_v32i8_to_v32i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s12, s4, 16
+; GFX12-NEXT: s_lshr_b32 s14, s2, 16
+; GFX12-NEXT: v_ashrrev_i16 v4, 8, s2
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX12-NEXT: s_bfe_i32 s20, s5, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v7, 8, s4
+; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000
+; GFX12-NEXT: s_lshr_b32 s17, s1, 16
+; GFX12-NEXT: s_lshr_b32 s15, s3, 16
+; GFX12-NEXT: s_lshr_b32 s16, s0, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s1
+; GFX12-NEXT: s_bfe_i32 s18, s1, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0
+; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v5, 8, s5
+; GFX12-NEXT: s_ashr_i64 s[0:1], s[4:5], 56
+; GFX12-NEXT: v_and_b32_e64 v10, 0xffff, s2
+; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s20
+; GFX12-NEXT: s_bfe_i32 s1, s17, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v3, 8, s3
+; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
+; GFX12-NEXT: s_bfe_i32 s2, s15, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1
+; GFX12-NEXT: s_bfe_i32 s1, s12, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v2, 0xffff, s18
+; GFX12-NEXT: v_and_b32_e64 v6, 0xffff, s19
+; GFX12-NEXT: v_and_b32_e64 v8, 0xffff, s3
+; GFX12-NEXT: v_ashrrev_i16 v11, 8, s15
+; GFX12-NEXT: v_and_b32_e64 v13, 0xffff, s4
+; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s2
+; GFX12-NEXT: v_lshl_or_b32 v4, v4, 16, v10
+; GFX12-NEXT: v_lshl_or_b32 v10, v5, 16, v12
+; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s1
+; GFX12-NEXT: s_bfe_i32 s1, s7, 0x80000
+; GFX12-NEXT: s_lshr_b32 s11, s7, 16
+; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s1
+; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000
+; GFX12-NEXT: s_lshr_b32 s10, s6, 16
+; GFX12-NEXT: v_lshl_or_b32 v2, v0, 16, v2
+; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v6
+; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v8
+; GFX12-NEXT: v_lshl_or_b32 v8, v7, 16, v13
+; GFX12-NEXT: v_lshl_or_b32 v7, v11, 16, v15
+; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s1
+; GFX12-NEXT: s_bfe_i32 s1, s11, 0x80000
+; GFX12-NEXT: s_lshr_b32 s13, s5, 16
+; GFX12-NEXT: v_and_b32_e64 v22, 0xffff, s1
+; GFX12-NEXT: s_bfe_i32 s1, s10, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v9, 8, s17
+; GFX12-NEXT: s_bfe_i32 s3, s14, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v11, 8, s7
+; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6
+; GFX12-NEXT: v_ashrrev_i16 v21, 8, s11
+; GFX12-NEXT: v_ashrrev_i16 v23, 8, s10
+; GFX12-NEXT: v_and_b32_e64 v24, 0xffff, s1
+; GFX12-NEXT: s_bfe_i32 s5, s16, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v1, 8, s12
+; GFX12-NEXT: v_ashrrev_i16 v18, 8, s14
+; GFX12-NEXT: s_bfe_i32 s4, s13, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v20, 0xffff, s3
+; GFX12-NEXT: v_ashrrev_i16 v17, 8, s16
+; GFX12-NEXT: v_and_b32_e64 v19, 0xffff, s5
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s4, s0
+; GFX12-NEXT: v_mov_b32_e32 v16, 0
+; GFX12-NEXT: v_lshl_or_b32 v3, v9, 16, v14
+; GFX12-NEXT: v_lshl_or_b32 v14, v11, 16, v12
+; GFX12-NEXT: v_mov_b32_e32 v11, s0
+; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v15
+; GFX12-NEXT: v_lshl_or_b32 v15, v21, 16, v22
+; GFX12-NEXT: v_lshl_or_b32 v13, v23, 16, v24
+; GFX12-NEXT: v_lshl_or_b32 v9, v1, 16, v5
+; GFX12-NEXT: v_lshl_or_b32 v5, v18, 16, v20
+; GFX12-NEXT: v_lshl_or_b32 v1, v17, 16, v19
+; GFX12-NEXT: s_clause 0x3
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%load = load <32 x i8>, ptr addrspace(4) %in
%ext = sext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
index 7cd6635a62113d..08e0f2e58a369d 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX11
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12
---
name: merge_s_load_x1_x1
@@ -16,18 +17,76 @@ body: |
...
---
-name: merge_s_load_x1_x1_x1_x1
+name: merge_s_load_x1_x2
body: |
bb.0:
- ; CHECK-LABEL: name: merge_s_load_x1_x1_x1_x1
+ ; CHECK-LABEL: name: merge_s_load_x1_x2
; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 4, 0 :: (dereferenceable invariant load (s64))
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64))
+...
+
+---
+name: merge_s_load_x1_x3
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: merge_s_load_x1_x3
+ ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16)
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96))
+...
+
+---
+name: merge_s_load_x1_x1_x1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: merge_s_load_x1_x1_x1
+ ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-LABEL: name: merge_s_load_x1_x1_x1
+ ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4)
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
+ %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: merge_s_load_x1_x1_x1_x1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1
+ ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
+ ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1
+ ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1
%0:sgpr_64 = IMPLICIT_DEF
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
%2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
@@ -39,23 +98,40 @@ body: |
name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
body: |
bb.0:
- ; CHECK-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
- ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub0
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1
+ ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
+ ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub0
+ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1
+ ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0
+ ; GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1
+ ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
+ ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub0_sub1_sub2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub2
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY4]].sub0
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY4]].sub1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[COPY1]].sub0_sub1_sub2
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub3
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]].sub0_sub1
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub2
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY10]].sub0
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY10]].sub1
%0:sgpr_64 = IMPLICIT_DEF
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
%2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
@@ -67,6 +143,24 @@ body: |
%8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 28, 0 :: (dereferenceable invariant load (s32))
...
+---
+name: merge_s_load_x2_x1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: merge_s_load_x2_x1
+ ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
+ ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+ ; GFX12-LABEL: name: merge_s_load_x2_x1
+ ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
+...
+
---
name: merge_s_load_x2_x2
body: |
@@ -101,6 +195,20 @@ body: |
%4:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64))
...
+---
+name: merge_s_load_x3_x1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: merge_s_load_x3_x1
+ ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96))
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32))
+...
+
---
name: merge_s_load_x4_x4
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
index 5ff0dbe65b5d12..c739c3caf1eb3f 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
@@ -1,8 +1,8 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX10
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12
# CHECK-LABEL: name: merge_s_buffer_load_x2
# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
-
name: merge_s_buffer_load_x2
tracksRegLiveness: true
body: |
@@ -17,6 +17,41 @@ body: |
...
---
+# CHECK-LABEL: name: merge_s_buffer_load_x1_x2
+# CHECK: S_BUFFER_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32))
+# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 4, 0 :: (dereferenceable invariant load (s64))
+name: merge_s_buffer_load_x1_x2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x2_x1
+# GFX10: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64))
+# GFX10: S_BUFFER_LOAD_DWORD_IMM %0, 8, 0 :: (dereferenceable invariant load (s32))
+# GFX12: S_BUFFER_LOAD_DWORDX3_IMM %0, 0, 0 :: (dereferenceable invariant load (s96), align 8)
+name: merge_s_buffer_load_x2_x1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
# CHECK-LABEL: name: merge_s_buffer_load_x4
# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
name: merge_s_buffer_load_x4
@@ -35,6 +70,39 @@ body: |
...
---
+# CHECK-LABEL: name: merge_s_buffer_load_x1_x3
+# CHECK: S_BUFFER_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32))
+# CHECK: S_BUFFER_LOAD_DWORDX3_IMM %0, 4, 0 :: (dereferenceable invariant load (s96), align 16)
+name: merge_s_buffer_load_x1_x3
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s96))
+
+ S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x3_x1
+# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128))
+name: merge_s_buffer_load_x3_x1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s96))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
# CHECK-LABEL: name: merge_s_buffer_load_x8
# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4)
name: merge_s_buffer_load_x8
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 098a4cbb36ede9..4695cadd45aeed 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -95,9 +95,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
;
; GFX12-LABEL: s_sub_imm_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
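
As a quick illustration of what the updated sub.ll checks directly above rely on, a minimal kernel with a 64-bit pointer argument followed by a 32-bit scalar argument is sketched below. This is illustrative only and not part of the patch; the function name is made up, but the argument shape matches the s_sub_imm_i32 test whose checks changed above.

; example.ll (illustrative): three adjacent kernarg dwords
define amdgpu_kernel void @kernarg_b96_example(ptr addrspace(1) %out, i32 %a) {
  %r = sub i32 1234, %a                  ; 1234 = 0x4d2 in the generated code
  store i32 %r, ptr addrspace(1) %out
  ret void
}
; Compiled with: llc -march=amdgcn -mcpu=gfx1200 example.ll
; Before this change, gfx1200 fetched the arguments with an s_clause holding
; s_load_b64 + s_load_b32; with scalar dwordx3 loads available, the backend
; can fetch all three dwords with a single s_load_b96 (the SILoadStoreOptimizer
; cases added above cover the corresponding DWORDX3 merges).
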
From a07f1337ea134fbbaba8f05bdc2068ca1d216a69 Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Fri, 15 Dec 2023 11:32:54 +0100
Subject: [PATCH 2/2] test updates
---
.../AMDGPU/GlobalISel/load-constant.96.ll | 263 ++++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 32 +--
2 files changed, 271 insertions(+), 24 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index c28d204784d4b2..4853bb309c1bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1,4 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
@@ -7,6 +9,53 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: s_clause 0xb
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v5
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v9
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1
+; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
+; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -166,6 +215,31 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v4, v[0:1], off offset:4
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -256,6 +330,13 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
+; GFX12-LABEL: v_load_constant_v3i32_align4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_load_constant_v3i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -291,6 +372,13 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
}
define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
+; GFX12-LABEL: v_load_constant_i96_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_load_constant_i96_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -326,6 +414,13 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
+; GFX12-LABEL: v_load_constant_v3i32_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_load_constant_v3i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -361,6 +456,13 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
}
define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
+; GFX12-LABEL: v_load_constant_v6i16_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_load_constant_v6i16_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -405,6 +507,25 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
}
define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
+; GFX12-LABEL: v_load_constant_v12i8_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
+; GFX12-NEXT: v_mov_b32_e32 v8, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_load_constant_v12i8_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,6 +596,13 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
+; GFX12-LABEL: v_load_constant_v3i32_align16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_load_constant_v3i32_align16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -506,6 +634,60 @@ define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg %ptr) {
+; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NOUNALIGNED-NEXT: s_clause 0xb
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v1, v0, s[0:1]
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v0, s[0:1] offset:1
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v0, s[0:1] offset:2
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v4, v0, s[0:1] offset:3
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v5, v0, s[0:1] offset:4
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v6, v0, s[0:1] offset:5
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v7, v0, s[0:1] offset:6
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v8, v0, s[0:1] offset:7
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v9, v0, s[0:1] offset:8
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v10, v0, s[0:1] offset:9
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v0, s[0:1] offset:11
+; GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v0, s[0:1] offset:10
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v8
+; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v3, v1
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v10, 8, v9
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v11
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v5, v6, v4
+; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v0, v8, v0, v7
+; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
@@ -674,6 +856,38 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg %ptr) {
+; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v2, v0, s[0:1] offset:2
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v3, v0, s[0:1] offset:4
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v4, v0, s[0:1] offset:6
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v0, s[0:1] offset:8
+; GFX12-NOUNALIGNED-NEXT: global_load_u16 v0, v0, s[0:1] offset:10
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v5
+; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
@@ -773,6 +987,12 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
+; GFX12-LABEL: s_load_constant_v3i32_align4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_load_constant_v3i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -804,6 +1024,12 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
}
define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
+; GFX12-LABEL: s_load_constant_i96_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_load_constant_i96_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -835,6 +1061,12 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
+; GFX12-LABEL: s_load_constant_v3i32_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_load_constant_v3i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -866,6 +1098,12 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
}
define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
+; GFX12-LABEL: s_load_constant_v6i16_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_load_constant_v6i16_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -898,6 +1136,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
}
define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
+; GFX12-LABEL: s_load_constant_v12i8_align8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_lshr_b32 s13, s0, 8
+; GFX12-NEXT: s_lshr_b32 s12, s0, 16
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: s_lshr_b32 s5, s1, 8
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: s_lshr_b32 s7, s1, 24
+; GFX12-NEXT: s_lshr_b32 s9, s2, 8
+; GFX12-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-NEXT: s_lshr_b32 s11, s2, 24
+; GFX12-NEXT: s_mov_b32 s4, s1
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s1, s13
+; GFX12-NEXT: s_mov_b32 s2, s12
+; GFX12-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_load_constant_v12i8_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
@@ -956,6 +1213,12 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
}
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %ptr) {
+; GFX12-LABEL: s_load_constant_v3i32_align16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
; GCN-LABEL: s_load_constant_v3i32_align16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index f202326b873b1f..d630ba946dca34 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -85,9 +85,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
;
; GFX12-LABEL: v_permlane16_b32_vii:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -133,9 +131,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0
;
; GFX12-LABEL: v_permlane16_b32_vll:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
@@ -207,9 +203,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
;
; GFX12-SDAG-LABEL: v_permlane16_b32_vvv:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -227,9 +221,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
;
; GFX12-GISEL-LABEL: v_permlane16_b32_vvv:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
@@ -629,9 +621,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src
;
; GFX12-LABEL: v_permlanex16_b32_vii:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -677,9 +667,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src
;
; GFX12-LABEL: v_permlanex16_b32_vll:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
@@ -751,9 +739,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
;
; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -771,9 +757,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
;
; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)