[llvm] [AMDGPU] CodeGen for GFX12 8/16-bit SMEM loads (PR #77633)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 12 04:12:53 PST 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77633
From 3cb0144068d48759c7ed3634905462fbb80d3082 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 8 Jan 2024 16:20:11 +0000
Subject: [PATCH 1/3] [AMDGPU] CodeGen for GFX12 8/16-bit SMEM loads
---
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 5 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 4 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 30 +-
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 6 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 26 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 156 +++-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 27 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 9 +-
llvm/lib/Target/AMDGPU/SMInstructions.td | 77 +-
.../AMDGPU/GlobalISel/load-constant.96.ll | 100 +--
.../AMDGPU/GlobalISel/regbankselect-load.mir | 36 +-
.../AMDGPU/gfx12_scalar_subword_loads.ll | 766 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 240 +++---
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 79 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 24 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 30 +-
20 files changed, 1326 insertions(+), 302 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
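
For context, a minimal sketch of the kind of IR this patch targets: a uniform sub-word load from the constant address space that, on gfx1200, should now select to s_load_i8 instead of a VMEM load. The pattern mirrors test_s_load_i8 in the new gfx12_scalar_subword_loads.ll test below; the function name here is illustrative only.

; assumed to be compiled with: llc -march=amdgcn -mcpu=gfx1200
define amdgpu_ps void @uniform_sext_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
  %ld = load i8, ptr addrspace(4) %in   ; uniform 8-bit load
  %ext = sext i8 %ld to i32             ; expected to fold into s_load_i8 on GFX12
  store i32 %ext, ptr addrspace(1) %out
  ret void
}
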
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2b85024a9b40be..7d829d3b867c8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -265,6 +265,10 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>;
def : GINodeEquiv<G_FPTRUNC_ROUND_DOWNWARD, SIfptrunc_round_downward>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 119aa80b9bb5d5..6166d14c79ea03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3190,10 +3190,11 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
auto Ld = cast<LoadSDNode>(N);
- if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
+ const MachineMemOperand *MMO = Ld->getMemOperand();
+ if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
return false;
- return Ld->getAlign() >= Align(4) &&
+ return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) &&
((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
(Subtarget->getScalarizeGlobalBehavior() &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0dbcaf5a1b136c..d7bc794c71dac1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5453,6 +5453,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(SBUFFER_LOAD)
+ NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
+ NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
+ NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
+ NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_BYTE)
NODE_NAME_CASE(BUFFER_STORE_SHORT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b55199..0f758bdb3182e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -567,6 +567,10 @@ enum NodeType : unsigned {
BUFFER_LOAD_FORMAT_TFE,
BUFFER_LOAD_FORMAT_D16,
SBUFFER_LOAD,
+ SBUFFER_LOAD_BYTE,
+ SBUFFER_LOAD_UBYTE,
+ SBUFFER_LOAD_SHORT,
+ SBUFFER_LOAD_USHORT,
BUFFER_STORE,
BUFFER_STORE_BYTE,
BUFFER_STORE_SHORT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index aa235c07e99597..6cd4ac0dbef02f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6443,15 +6443,28 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return true;
}
-bool AMDGPULegalizerInfo::legalizeSBufferLoad(
- LegalizerHelper &Helper, MachineInstr &MI) const {
+bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
GISelChangeObserver &Observer = Helper.Observer;
- Register Dst = MI.getOperand(0).getReg();
- LLT Ty = B.getMRI()->getType(Dst);
+ Register OrigDst = MI.getOperand(0).getReg();
+ Register Dst;
+ LLT Ty = B.getMRI()->getType(OrigDst);
unsigned Size = Ty.getSizeInBits();
MachineFunction &MF = B.getMF();
+ unsigned Opc = 0;
+ if (Size < 32 && ST.hasScalarSubwordLoads()) {
+ assert(Size == 8 || Size == 16);
+ Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
+ : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
+ // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
+ // destination register.
+ Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
+ } else {
+ Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
+ Dst = OrigDst;
+ }
Observer.changingInstr(MI);
@@ -6469,19 +6482,24 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// FIXME: We don't really need this intermediate instruction. The intrinsic
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
- MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
+ MI.setDesc(B.getTII().get(Opc));
MI.removeOperand(1); // Remove intrinsic ID
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
// TODO: Should this use datalayout alignment?
const unsigned MemSize = (Size + 7) / 8;
- const Align MemAlign(4);
+ const Align MemAlign(std::min(MemSize, 4u));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
+ if (Dst != OrigDst) {
+ MI.getOperand(0).setReg(Dst);
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ B.buildTrunc(OrigDst, Dst);
+ }
// If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index bb1d6cb72e8071..a1c34e92a57f35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -411,6 +411,12 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
return Width == 16;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
+ return Width == 8;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
+ return Width == 16;
}
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 391c2b9ec256ea..f96328b34935f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -449,8 +449,13 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+ const unsigned MemSize = 8 * MMO->getSize();
+
// Require 4-byte alignment.
- return MMO->getAlign() >= Align(4) &&
+ return (MMO->getAlign() >= Align(4) ||
+ (Subtarget.hasScalarSubwordLoads() &&
+ ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
+ (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
// Can't do a scalar atomic load.
!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address
@@ -1074,6 +1079,13 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
(MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
return false;
+ if (LoadSize == 32 &&
+ ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
+ (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
+ isScalarLoadLegal(MI) &&
+ Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
+ return false;
+
Register PtrReg = MI.getOperand(1).getReg();
ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
@@ -3073,7 +3085,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(B, MI, {3, 6});
return;
}
- case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
applyMappingSBufferLoad(B, OpdMapper);
return;
}
@@ -4396,7 +4412,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// initialized.
break;
}
- case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
// Lie and claim everything is legal, even though some need to be
// SGPRs. applyMapping will have to deal with it as a waterfall loop.
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..e3c4a699afe79f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -423,6 +423,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return GFX9Insts;
}
+ bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
}
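
The SIISelLowering changes below handle the scalar buffer-load intrinsics. A minimal sketch of the uniform-offset case, mirroring s_buffer_load_byte_imm_offset in the new test file (the function name is illustrative): with a uniform offset the intrinsic lowers to SBUFFER_LOAD_UBYTE, and performSignExtendInRegCombine then folds the sext so that s_buffer_load_i8 is selected.

declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32)

define amdgpu_ps i32 @sbuffer_sext_i8(<4 x i32> inreg %rsrc) {
  ; uniform byte offset of 4; expected to select to s_buffer_load_i8 on gfx1200
  %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 4, i32 0)
  %ext = sext i8 %ld to i32
  ret i32 %ext
}
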
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6ddc7e864fb23c..945634176299f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -855,7 +855,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
- MVT::v2i16, MVT::v2f16, MVT::i128},
+ MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN,
@@ -5720,7 +5720,7 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
if (isTypeLegal(LoadVT)) {
return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
@@ -5894,6 +5894,56 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case Intrinsic::amdgcn_s_buffer_load: {
+ // Lower llvm.amdgcn.s.buffer.load.{i8, u8} intrinsics. First, we generate
+ // s_buffer_load_u8 for both signed and unsigned loads. Next, the DAG
+ // combiner tries to merge s_buffer_load_u8 with a following sext
+ // (performSignExtendInRegCombine()) and replaces it with
+ // s_buffer_load_i8.
+ assert(Subtarget->hasScalarSubwordLoads() &&
+ "s_buffer_load_{u8, i8} are supported "
+ "in GFX12 (or newer) architectures.");
+ SDValue Op = SDValue(N, 0);
+ SDValue Rsrc = Op.getOperand(1);
+ SDValue Offset = Op.getOperand(2);
+ SDValue CachePolicy = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ Align Alignment =
+ DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), Alignment);
+ SDValue LoadVal;
+ if (!Offset->isDivergent()) {
+ SDValue Ops[] = {Rsrc, // source register
+ Offset, CachePolicy};
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
+ DAG.getVTList(MVT::i32), Ops, VT, MMO);
+ LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ } else {
+ SDValue Ops[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ CachePolicy, // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+ setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
+ LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+ }
+ Results.push_back(LoadVal);
+ return;
+ }
}
break;
}
@@ -7751,11 +7801,18 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
VT.getStoreSize(), Alignment);
if (!Offset->isDivergent()) {
- SDValue Ops[] = {
- Rsrc,
- Offset, // Offset
- CachePolicy
- };
+ SDValue Ops[] = {Rsrc, Offset, CachePolicy};
+
+ // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
+ // s_buffer_load_u16 instruction is emitted for both signed and unsigned
+ // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with sext
+ // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
+ if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
+ DAG.getVTList(MVT::i32), Ops, VT, MMO);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ }
// Widen vec3 load to vec4.
if (VT.isVector() && VT.getVectorNumElements() == 3 &&
@@ -7776,6 +7833,21 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
// We have a divergent offset. Emit a MUBUF buffer load instead. We can
// assume that the buffer is unswizzled.
+ SDValue Ops[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ CachePolicy, // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+ if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+ setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
+ return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+ }
+
SmallVector<SDValue, 4> Loads;
unsigned NumLoads = 1;
MVT LoadVT = VT.getSimpleVT();
@@ -7789,16 +7861,6 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
}
SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
- SDValue Ops[] = {
- DAG.getEntryNode(), // Chain
- Rsrc, // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- {}, // voffset
- {}, // soffset
- {}, // offset
- CachePolicy, // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
- };
// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
@@ -8378,9 +8440,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M, DAG, Ops);
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+ if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
+ M->getMemOperand());
return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand(), DAG);
@@ -9770,18 +9832,17 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
}
// Handle 8 bit and 16 bit buffer loads
-SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
- EVT LoadVT, SDLoc DL,
- ArrayRef<SDValue> Ops,
- MemSDNode *M) const {
+SDValue
+SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
+ SDLoc DL, ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) const {
EVT IntVT = LoadVT.changeTypeToInteger();
unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
- SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
- Ops, IntVT,
- M->getMemOperand());
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
@@ -12062,17 +12123,42 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
return SDValue();
}
-SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
- DAGCombinerInfo &DCI)
- const {
+SDValue
+SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SDValue Src = N->getOperand(0);
auto *VTSign = cast<VTSDNode>(N->getOperand(1));
- if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
- VTSign->getVT() == MVT::i8) ||
- (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
- VTSign->getVT() == MVT::i16)) &&
- Src.hasOneUse()) {
+ // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
+ // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
+ if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
+ VTSign->getVT() == MVT::i8) ||
+ (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
+ VTSign->getVT() == MVT::i16))) {
+ assert(Subtarget->hasScalarSubwordLoads() &&
+ "s_buffer_load_{u8, i8} are supported "
+ "in GFX12 (or newer) architectures.");
+ EVT VT = Src.getValueType();
+ unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
+ ? AMDGPUISD::SBUFFER_LOAD_BYTE
+ : AMDGPUISD::SBUFFER_LOAD_SHORT;
+ SDLoc DL(N);
+ SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
+ SDValue Ops[] = {
+ Src.getOperand(0), // source register
+ Src.getOperand(1), // offset
+ Src.getOperand(2) // cachePolicy
+ };
+ auto *M = cast<MemSDNode>(Src);
+ SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
+ Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
+ SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ return LoadVal;
+ } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
+ VTSign->getVT() == MVT::i8) ||
+ (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
+ VTSign->getVT() == MVT::i16)) &&
+ Src.hasOneUse()) {
auto *M = cast<MemSDNode>(Src);
SDValue Ops[] = {
Src.getOperand(0), // Chain
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 92b38ebade6217..d66ba0b59ba906 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -273,7 +273,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
- ArrayRef<SDValue> Ops, MemSDNode *M) const;
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) const;
// Handle 8 bit and 16 bit buffer stores
SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 04c92155f5aada..18dcdb0a9b307e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -41,10 +41,29 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
- SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
- [SDNPMayLoad, SDNPMemOperand]
->;
+def SDTSBufferLoad : SDTypeProfile<1, 3,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // offset(imm)
+ SDTCisVT<3, i32>]>; // cachepolicy
+
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_byte : SDNode<"AMDGPUISD::SBUFFER_LOAD_BYTE", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_ubyte
+ : SDNode<"AMDGPUISD::SBUFFER_LOAD_UBYTE", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_short
+ : SDNode<"AMDGPUISD::SBUFFER_LOAD_SHORT", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_ushort
+ : SDNode<"AMDGPUISD::SBUFFER_LOAD_USHORT", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b4bd46d33c1f10..22ac198eaa725b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3877,7 +3877,8 @@ def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
// a workaround for the intrinsic being defined as readnone, but
// really needs a memory operand.
-def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
+
+class SBufferLoadInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
let hasSideEffects = 0;
@@ -3885,6 +3886,12 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
let mayStore = 0;
}
+def G_AMDGPU_S_BUFFER_LOAD : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_SBYTE : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction;
+
def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src0, type0:$src1);
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index fc29ce8d71f2c2..9a27d22d585ecb 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -810,7 +810,7 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
// Scalar Memory Patterns
//===----------------------------------------------------------------------===//
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> {
+class SMRDLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{ return isUniformLoad(N);}]> {
let GISelPredicateCode = [{
if (!MI.hasOneMemOperand())
return false;
@@ -827,6 +827,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
}];
}
+def smrd_load : SMRDLoadPat<load>;
+def smrd_extloadi8 : SMRDLoadPat<extloadi8>;
+def smrd_zextloadi8 : SMRDLoadPat<zextloadi8>;
+def smrd_sextloadi8 : SMRDLoadPat<sextloadi8>;
+def smrd_extloadi16 : SMRDLoadPat<extloadi16>;
+def smrd_zextloadi16 : SMRDLoadPat<zextloadi16>;
+def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
+
def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
[{ return !N->getOperand(1)->isDivergent();}]> {
@@ -923,11 +931,78 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
}
}
+multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, ValueType vt> {
+ // 1. IMM offset
+ def : GCNPat <
+ (node (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 2. SGPR offset
+ def : GCNPat <
+ (node (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 3. SGPR+IMM offset
+ def : GCNPat <
+ (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 4. No offset
+ def : GCNPat <
+ (vt (node (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+}
+
+multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
+
+ // 1. Offset as an immediate
+ def : GCNPat <
+ (name v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
+ (i32 (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 2. Offset as a 32-bit SGPR
+ def : GCNPat <
+ (name v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
+ (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 3. Offset as a 32-bit SGPR + immediate
+ def : GCNPat <
+ (name v4i32:$sbase, (SMRDBufferSgprImm i32:$soffset, i32:$offset),
+ timm:$cachepolicy),
+ (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
+ (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isGFX12Plus];
+ }
+}
+
// Global and constant loads can be selected to either MUBUF or SMRD
// instructions, but SMRD instructions are faster so we want the instruction
// selector to prefer those.
let AddedComplexity = 100 in {
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U8", smrd_extloadi8, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U8", smrd_zextloadi8, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_I8", smrd_sextloadi8, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U16", smrd_extloadi16, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U16", smrd_zextloadi16, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_I16", smrd_sextloadi16, i32>;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_byte, "S_BUFFER_LOAD_I8">;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ubyte, "S_BUFFER_LOAD_U8">;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_short, "S_BUFFER_LOAD_I16">;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ushort, "S_BUFFER_LOAD_U16">;
+
foreach vt = Reg32Types.types in {
defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 4853bb309c1bb6..d527a51aad8ae1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -646,46 +646,38 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
;
; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX12-NOUNALIGNED: ; %bb.0:
-; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NOUNALIGNED-NEXT: s_clause 0xb
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v1, v0, s[0:1]
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v0, s[0:1] offset:1
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v0, s[0:1] offset:2
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v4, v0, s[0:1] offset:3
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v5, v0, s[0:1] offset:4
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v6, v0, s[0:1] offset:5
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v7, v0, s[0:1] offset:6
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v8, v0, s[0:1] offset:7
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v9, v0, s[0:1] offset:8
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v10, v0, s[0:1] offset:9
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v0, s[0:1] offset:11
-; GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v0, s[0:1] offset:10
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v4
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v8
-; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v3, v1
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v10, 8, v9
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v11
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v5, v6, v4
-; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v0, v8, v0, v7
-; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2
-; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s5, s[0:1], 0x5
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s6, s[0:1], 0x7
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s7, s[0:1], 0x6
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s8, s[0:1], 0x9
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s9, s[0:1], 0xb
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa
+; GFX12-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
@@ -868,24 +860,20 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
;
; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX12-NOUNALIGNED: ; %bb.0:
-; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NOUNALIGNED-NEXT: s_clause 0x5
-; GFX12-NOUNALIGNED-NEXT: global_load_u16 v1, v0, s[0:1]
-; GFX12-NOUNALIGNED-NEXT: global_load_u16 v2, v0, s[0:1] offset:2
-; GFX12-NOUNALIGNED-NEXT: global_load_u16 v3, v0, s[0:1] offset:4
-; GFX12-NOUNALIGNED-NEXT: global_load_u16 v4, v0, s[0:1] offset:6
-; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v0, s[0:1] offset:8
-; GFX12-NOUNALIGNED-NEXT: global_load_u16 v0, v0, s[0:1] offset:10
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v5
-; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2
-; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2
+; GFX12-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6
+; GFX12-NOUNALIGNED-NEXT: s_load_u16 s4, s[0:1], 0xa
+; GFX12-NOUNALIGNED-NEXT: s_load_u16 s5, s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT: s_load_u16 s6, s[0:1], 0x4
+; GFX12-NOUNALIGNED-NEXT: s_load_u16 s7, s[0:1], 0x8
+; GFX12-NOUNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 16
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 16
+; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s5
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s6
+; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index 442902c9fc8f55..74af51f0c1676d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -529,12 +529,18 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; GCN-LABEL: name: extload_constant_i8_to_i32_uniform
- ; GCN: liveins: $sgpr0_sgpr1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ; GFX7-LABEL: name: extload_constant_i8_to_i32_uniform
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ;
+ ; GFX12-LABEL: name: extload_constant_i8_to_i32_uniform
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 4, align 1)
...
@@ -565,12 +571,18 @@ body: |
bb.0:
liveins: $sgpr0_sgpr1
- ; GCN-LABEL: name: extload_constant_i16_to_i32_uniform
- ; GCN: liveins: $sgpr0_sgpr1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ; GFX7-LABEL: name: extload_constant_i16_to_i32_uniform
+ ; GFX7: liveins: $sgpr0_sgpr1
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
+ ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ;
+ ; GFX12-LABEL: name: extload_constant_i16_to_i32_uniform
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 4, align 2)
...
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
new file mode 100644
index 00000000000000..c4c0087dcdbb86
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -0,0 +1,766 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_i8 s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %ld = load i8, ptr addrspace(4) %in
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i8_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_i8 s0, s[0:1], -0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep = getelementptr i8, ptr addrspace(4) %in, i64 -100
+ %ld = load i8, ptr addrspace(4) %gep
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i8_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i8_sgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_i8 s0, s[0:1], s2 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %zext = zext i32 %offset to i64
+ %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
+ %ld = load i8, ptr addrspace(4) %gep
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i8_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i8_sgpr_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_i8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
+ %zext = zext i32 %offset to i64
+ %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext
+ %ld = load i8, ptr addrspace(4) %gep2
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i8_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i8_divergent:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_load_i8 v0, v0, s[0:1] offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
+ %zext = zext i32 %offset to i64
+ %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext
+ %ld = load i8, ptr addrspace(4) %gep2
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %ld = load i8, ptr addrspace(4) %in
+ %zext = zext i8 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u8_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_u8 s0, s[0:1], 0xff
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep = getelementptr i8, ptr addrspace(4) %in, i64 255
+ %ld = load i8, ptr addrspace(4) %gep
+ %zext = zext i8 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u8_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u8_sgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %zext1 = zext i32 %offset to i64
+ %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
+ %ld = load i8, ptr addrspace(4) %gep
+ %zext2 = zext i8 %ld to i32
+ store i32 %zext2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u8_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u8_sgpr_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
+ %zext1= zext i32 %offset to i64
+ %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext1
+ %ld = load i8, ptr addrspace(4) %gep2
+ %zext2= zext i8 %ld to i32
+ store i32 %zext2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u8_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u8_divergent:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
+ %zext1= zext i32 %offset to i64
+ %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext1
+ %ld = load i8, ptr addrspace(4) %gep2
+ %zext2= zext i8 %ld to i32
+ store i32 %zext2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_i16 s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %ld = load i16, ptr addrspace(4) %in
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i16_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_i16 s0, s[0:1], -0xc8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep = getelementptr i16, ptr addrspace(4) %in, i64 -100
+ %ld = load i16, ptr addrspace(4) %gep
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i16_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_i16_sgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_i16 s0, s[0:1], s2 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %zext = zext i32 %offset to i64
+ %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
+ %ld = load i16, ptr addrspace(4) %gep
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i16_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; DAG-LABEL: test_s_load_i16_sgpr_imm:
+; DAG: ; %bb.0:
+; DAG-NEXT: s_mov_b32 s3, 0
+; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; DAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; DAG-NEXT: s_load_i16 s0, s[0:1], 0x20
+; DAG-NEXT: s_waitcnt lgkmcnt(0)
+; DAG-NEXT: v_mov_b32_e32 v2, s0
+; DAG-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_s_load_i16_sgpr_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s3, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_i16 s0, s[0:1], 0x20
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
+ %zext = zext i32 %offset to i64
+ %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext
+ %ld = load i16, ptr addrspace(4) %gep2
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
+; DAG-LABEL: test_s_load_i16_divergent:
+; DAG: ; %bb.0:
+; DAG-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0
+; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DAG-NEXT: v_lshlrev_b64_e32 v[3:4], 1, v[3:4]
+; DAG-NEXT: v_add_co_u32 v3, vcc_lo, s0, v3
+; DAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAG-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s1, v4, vcc_lo
+; DAG-NEXT: global_load_i16 v0, v[3:4], off offset:32
+; DAG-NEXT: s_waitcnt vmcnt(0)
+; DAG-NEXT: global_store_b32 v[1:2], v0, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_s_load_i16_divergent:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v6, s1
+; GISEL-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 1, v[0:1]
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo
+; GISEL-NEXT: global_load_i16 v0, v[0:1], off offset:32
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_b32 v[3:4], v0, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
+ %zext = zext i32 %offset to i64
+ %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext
+ %ld = load i16, ptr addrspace(4) %gep2
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u16(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_u16 s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %ld = load i16, ptr addrspace(4) %in
+ %zext = zext i16 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u16_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_u16 s0, s[0:1], 0x1fe
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %gep = getelementptr i16, ptr addrspace(4) %in, i64 255
+ %ld = load i16, ptr addrspace(4) %gep
+ %zext = zext i16 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u16_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; GCN-LABEL: test_s_load_u16_sgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+ %zext1 = zext i32 %offset to i64
+ %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
+ %ld = load i16, ptr addrspace(4) %gep
+ %zext2 = zext i16 %ld to i32
+ store i32 %zext2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u16_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
+; DAG-LABEL: test_s_load_u16_sgpr_imm:
+; DAG: ; %bb.0:
+; DAG-NEXT: s_mov_b32 s3, 0
+; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; DAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; DAG-NEXT: s_load_u16 s0, s[0:1], 0x20
+; DAG-NEXT: s_waitcnt lgkmcnt(0)
+; DAG-NEXT: v_mov_b32_e32 v2, s0
+; DAG-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_s_load_u16_sgpr_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s3, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_u16 s0, s[0:1], 0x20
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
+ %zext1= zext i32 %offset to i64
+ %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext1
+ %ld = load i16, ptr addrspace(4) %gep2
+ %zext2= zext i16 %ld to i32
+ store i32 %zext2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
+; DAG-LABEL: test_s_load_u16_divergent:
+; DAG: ; %bb.0:
+; DAG-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0
+; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DAG-NEXT: v_lshlrev_b64_e32 v[3:4], 1, v[3:4]
+; DAG-NEXT: v_add_co_u32 v3, vcc_lo, s0, v3
+; DAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAG-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s1, v4, vcc_lo
+; DAG-NEXT: global_load_u16 v0, v[3:4], off offset:32
+; DAG-NEXT: s_waitcnt vmcnt(0)
+; DAG-NEXT: global_store_b32 v[1:2], v0, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_s_load_u16_divergent:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v6, s1
+; GISEL-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 1, v[0:1]
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo
+; GISEL-NEXT: global_load_u16 v0, v[0:1], off offset:32
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_b32 v[3:4], v0, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+ %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
+ %zext1= zext i32 %offset to i64
+ %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext1
+ %ld = load i16, ptr addrspace(4) %gep2
+ %zext2= zext i16 %ld to i32
+ store i32 %zext2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
+; GCN-LABEL: s_buffer_load_byte_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
+; GCN-LABEL: s_buffer_load_byte_sgpr:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
+; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %off = add nuw nsw i32 %in, 100
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0)
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
+; DAG-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
+; DAG: ; %bb.0: ; %main_body
+; DAG-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen
+; DAG-NEXT: s_waitcnt vmcnt(0)
+; DAG-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+main_body:
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
+ %sext = sext i8 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
+; GCN-LABEL: s_buffer_load_ubyte_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, 0xff
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
+ %zext = zext i8 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
+; GCN-LABEL: s_buffer_load_ubyte_sgpr:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, 0xff
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
+ %zext = zext i8 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
+; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, 0xff
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %off = add nuw nsw i32 %in, 100
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
+ %zext = zext i8 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
+; DAG-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
+; DAG: ; %bb.0: ; %main_body
+; DAG-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen
+; DAG-NEXT: s_waitcnt vmcnt(0)
+; DAG-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+main_body:
+ %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
+ %zext = zext i8 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
+; GCN-LABEL: s_buffer_load_short_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
+; GCN-LABEL: s_buffer_load_short_sgpr:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
+; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %off = add nuw nsw i32 %in, 100
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %off, i32 0)
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
+; DAG-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
+; DAG: ; %bb.0: ; %main_body
+; DAG-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen
+; DAG-NEXT: s_waitcnt vmcnt(0)
+; DAG-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+main_body:
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
+ %sext = sext i16 %ld to i32
+ store i32 %sext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
+; GCN-LABEL: s_buffer_load_ushort_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
+ %zext = zext i16 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
+; GCN-LABEL: s_buffer_load_ushort_sgpr:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
+ %zext = zext i16 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
+; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+main_body:
+ %off = add nuw nsw i32 %in, 100
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %off, i32 0)
+ %zext = zext i16 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
+; DAG-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
+; DAG: ; %bb.0: ; %main_body
+; DAG-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen
+; DAG-NEXT: s_waitcnt vmcnt(0)
+; DAG-NEXT: global_store_b32 v[0:1], v2, off
+; DAG-NEXT: s_nop 0
+; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+main_body:
+ %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
+ %zext = zext i16 %ld to i32
+ store i32 %zext, ptr addrspace(1) %out
+ ret void
+}
+
+declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32)
+declare i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32>, i32, i32)
+declare i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32>, i32, i32)
+declare i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32>, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index b2b3f3e1bfbd96..46267c93646f47 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -29,9 +29,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sb
;
; GFX12-LABEL: global_load_saddr_i8_offset_0:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%load = load i8, ptr addrspace(1) %sbase
%zext = zext i8 %load to i32
@@ -64,9 +64,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg
;
; GFX12-LABEL: global_load_saddr_i8_offset_4095:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0xfff
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
%load = load i8, ptr addrspace(1) %gep0
@@ -93,9 +93,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg
;
; GFX12-LABEL: global_load_saddr_i8_offset_4096:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4096
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x1000
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096
%load = load i8, ptr addrspace(1) %gep0
@@ -122,9 +122,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg
;
; GFX12-LABEL: global_load_saddr_i8_offset_4097:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4097
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x1001
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097
%load = load i8, ptr addrspace(1) %gep0
@@ -159,9 +159,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr
;
; GFX12-LABEL: global_load_saddr_i8_offset_neg4096:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x1000
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096
%load = load i8, ptr addrspace(1) %gep0
@@ -200,9 +200,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr
;
; GFX12-LABEL: global_load_saddr_i8_offset_neg4097:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4097
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x1001
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097
%load = load i8, ptr addrspace(1) %gep0
@@ -241,9 +241,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr
;
; GFX12-LABEL: global_load_saddr_i8_offset_neg4098:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4098
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x1002
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098
%load = load i8, ptr addrspace(1) %gep0
@@ -277,9 +277,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg
;
; GFX12-LABEL: global_load_saddr_i8_offset_2048:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x800
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048
%load = load i8, ptr addrspace(1) %gep0
@@ -313,9 +313,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg
;
; GFX12-LABEL: global_load_saddr_i8_offset_2049:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x801
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049
%load = load i8, ptr addrspace(1) %gep0
@@ -349,9 +349,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg
;
; GFX12-LABEL: global_load_saddr_i8_offset_2050:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x802
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050
%load = load i8, ptr addrspace(1) %gep0
@@ -378,9 +378,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr
;
; GFX12-LABEL: global_load_saddr_i8_offset_neg2048:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x800
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048
%load = load i8, ptr addrspace(1) %gep0
@@ -415,9 +415,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr
;
; GFX12-LABEL: global_load_saddr_i8_offset_neg2049:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x801
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049
%load = load i8, ptr addrspace(1) %gep0
@@ -452,9 +452,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr
;
; GFX12-LABEL: global_load_saddr_i8_offset_neg2050:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x802
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050
%load = load i8, ptr addrspace(1) %gep0
@@ -487,9 +487,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x7FFFFF(ptr addrspace(1) in
;
; GFX12-LABEL: global_load_saddr_i8_offset_0x7FFFFF:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x7fffff
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 8388607
%load = load i8, ptr addrspace(1) %gep0
@@ -527,9 +527,9 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) in
;
; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8388608
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], -0x800000
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -8388608
%load = load i8, ptr addrspace(1) %gep0
@@ -562,9 +562,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
;
; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, 0xff800000
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_mov_b32 s0, -1
+; GFX12-NEXT: s_load_u8 s0, s[2:3], s0 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295
%load = load i8, ptr addrspace(1) %gep0
@@ -602,21 +603,22 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1)
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: s_mov_b32 s1, 1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296
%load = load i8, ptr addrspace(1) %gep0
@@ -654,21 +656,22 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:1
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_mov_b32 s1, s0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000001:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 1
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297
%load = load i8, ptr addrspace(1) %gep0
@@ -706,21 +709,22 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1)
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000FFF:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xfff
+; GFX12-SDAG-NEXT: s_mov_b32 s1, 1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000FFF:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391
%load = load i8, ptr addrspace(1) %gep0
@@ -758,21 +762,22 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1)
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100001000:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1000
+; GFX12-SDAG-NEXT: s_mov_b32 s1, 1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100001000:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392
%load = load i8, ptr addrspace(1) %gep0
@@ -811,21 +816,22 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0x800000, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 1
+; GFX12-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 1
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295
%load = load i8, ptr addrspace(1) %gep0
@@ -863,21 +869,22 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296
%load = load i8, ptr addrspace(1) %gep0
@@ -915,21 +922,22 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000001:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1
-; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_mov_b32 s0, -1
+; GFX12-SDAG-NEXT: s_mov_b32 s1, -2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000001:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -1
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -2
+; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297
%load = load i8, ptr addrspace(1) %gep0
@@ -1685,9 +1693,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1
;
; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1715,9 +1723,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a
;
; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:-0x18
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1746,9 +1754,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr
;
; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
@@ -1778,9 +1786,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_
;
; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:0x80
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 4ed4034a0348f4..aaa3ed929d0355 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -66,11 +66,12 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
; GFX12-LABEL: constant_load_i1:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -611,10 +612,10 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
; GFX12-LABEL: constant_zextload_i1_to_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -678,11 +679,12 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
; GFX12-LABEL: constant_sextload_i1_to_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -743,10 +745,10 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
; GFX12-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -810,11 +812,12 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
; GFX12-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4453,11 +4456,12 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
; GFX12-LABEL: constant_zextload_i1_to_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4524,13 +4528,12 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
; GFX12-LABEL: constant_sextload_i1_to_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4596,11 +4599,12 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
; GFX12-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4667,13 +4671,12 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
; GFX12-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 585f96b9ffb2e6..6c5deb72740faf 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -838,10 +838,10 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
; GFX12-LABEL: constant_zextload_i16_to_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -916,10 +916,10 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
; GFX12-LABEL: constant_sextload_i16_to_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_i16 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -993,10 +993,10 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GFX12-LABEL: constant_zextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1071,10 +1071,10 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GFX12-LABEL: constant_sextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_i16 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index f18a34515a8265..ba0e3552eecf4d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -583,10 +583,10 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX12-LABEL: constant_zextload_i8_to_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -661,10 +661,10 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX12-LABEL: constant_sextload_i8_to_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -738,10 +738,10 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX12-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -816,10 +816,10 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX12-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5437,10 +5437,10 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX12-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
>From b92efe805e1b49168a7b5841a7e94b374b0fbe90 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 11 Jan 2024 17:18:27 +0000
Subject: [PATCH 2/3] Fix assertion failure compiling i8 buffer load pre GFX12
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 945634176299f4..dd43a0dd679cc7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5900,9 +5900,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
// combiner tries to merge the s_buffer_load_u8 with a sext instruction
// (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
// s_buffer_load_i8.
- assert(Subtarget->hasScalarSubwordLoads() &&
- "s_buffer_load_{u8, i8} are supported "
- "in GFX12 (or newer) architectures.");
+ if (!Subtarget->hasScalarSubwordLoads())
+ return;
SDValue Op = SDValue(N, 0);
SDValue Rsrc = Op.getOperand(1);
SDValue Offset = Op.getOperand(2);
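
For reference, here is a minimal IR sketch of the kind of input that exercises this path when compiled for a pre-GFX12 target such as gfx1100 (the function and value names are made up; the intrinsic signature is the one declared in the new test file above). With the assert turned into an early return, such targets simply bail out of the custom result replacement here instead of hitting the assertion:

; byte-sized scalar buffer load whose i8 result is not a legal type and
; therefore reaches ReplaceNodeResults during legalization
define amdgpu_ps void @sbuffer_load_i8_pre_gfx12(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
  %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0)
  %sext = sext i8 %ld to i32
  store i32 %sext, ptr addrspace(1) %out
  ret void
}
declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32)
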
>From 463c48a6b228420361625c749bd226f68aaf8031 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 12 Jan 2024 12:12:41 +0000
Subject: [PATCH 3/3] Add TODO comments
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 2 ++
llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 +
4 files changed, 5 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d7bc794c71dac1..1df0fe78fe989e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -784,6 +784,7 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
unsigned AS = MN->getAddressSpace();
// Do not shrink an aligned scalar load to sub-dword.
// Scalar engine cannot do sub-dword loads.
+ // TODO: Update this for GFX12 which does have scalar sub-dword loads.
if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
(AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index fb7148ba10ac15..1983e9f8d4af71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -90,6 +90,8 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
+
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index c32303defe7f22..1c514ffa76c9d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -202,6 +202,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
// Since we don't have sub-dword scalar loads, avoid doing an extload by
// loading earlier than the argument address, and extracting the relevant
// bits.
+ // TODO: Update this for GFX12 which does have scalar sub-dword loads.
//
// Additionally widen any sub-dword load to i32 even if suitably aligned,
// so that CSE between different argument loads works easily.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dd43a0dd679cc7..51c48041f7d202 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1525,6 +1525,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// will use a MUBUF load.
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
+ // TODO: Update this for GFX12 which does have scalar sub-dword loads.
if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
return isLegalGlobalAddressingMode(AM);
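
The four TODO comments added in this patch all point at the same pre-GFX12 assumption: the scalar unit cannot load less than a dword, so aligned sub-dword uniform loads get widened to 32 bits or routed through vector memory instructions. A rough sketch of the kind of IR these heuristics were written for (names are illustrative, not taken from an existing test):

define amdgpu_kernel void @uniform_i8_load(ptr addrspace(1) %out, ptr addrspace(4) %in) {
  ; naturally aligned i8 load from the constant address space - the kind of
  ; scalar sub-dword access the TODO comments refer to; with this patch GFX12
  ; can select s_load_u8 for it, so the widening is no longer mandatory there
  %v = load i8, ptr addrspace(4) %in, align 1
  %z = zext i8 %v to i32
  store i32 %z, ptr addrspace(1) %out
  ret void
}
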