[llvm] [AMDGPU] CodeGen for GFX12 8/16-bit SMEM loads (PR #77633)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 10 09:31:28 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
---
Patch is 104.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77633.diff
20 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+3-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+24-6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp (+6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+23-3)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+2)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+121-35)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+23-4)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+8-1)
- (modified) llvm/lib/Target/AMDGPU/SMInstructions.td (+76-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll (+44-56)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir (+24-12)
- (added) llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll (+766)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+124-116)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+41-38)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+15-15)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2b85024a9b40be..7d829d3b867c8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -265,6 +265,10 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>;
def : GINodeEquiv<G_FPTRUNC_ROUND_DOWNWARD, SIfptrunc_round_downward>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 119aa80b9bb5d5..6166d14c79ea03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3190,10 +3190,11 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
auto Ld = cast<LoadSDNode>(N);
- if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
+ const MachineMemOperand *MMO = Ld->getMemOperand();
+ if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
return false;
- return Ld->getAlign() >= Align(4) &&
+ return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) &&
((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
(Subtarget->getScalarizeGlobalBehavior() &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0dbcaf5a1b136c..d7bc794c71dac1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5453,6 +5453,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(SBUFFER_LOAD)
+ NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
+ NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
+ NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
+ NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_BYTE)
NODE_NAME_CASE(BUFFER_STORE_SHORT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b55199..0f758bdb3182e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -567,6 +567,10 @@ enum NodeType : unsigned {
BUFFER_LOAD_FORMAT_TFE,
BUFFER_LOAD_FORMAT_D16,
SBUFFER_LOAD,
+ SBUFFER_LOAD_BYTE,
+ SBUFFER_LOAD_UBYTE,
+ SBUFFER_LOAD_SHORT,
+ SBUFFER_LOAD_USHORT,
BUFFER_STORE,
BUFFER_STORE_BYTE,
BUFFER_STORE_SHORT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index aa235c07e99597..6cd4ac0dbef02f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6443,15 +6443,28 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return true;
}
-bool AMDGPULegalizerInfo::legalizeSBufferLoad(
- LegalizerHelper &Helper, MachineInstr &MI) const {
+bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
GISelChangeObserver &Observer = Helper.Observer;
- Register Dst = MI.getOperand(0).getReg();
- LLT Ty = B.getMRI()->getType(Dst);
+ Register OrigDst = MI.getOperand(0).getReg();
+ Register Dst;
+ LLT Ty = B.getMRI()->getType(OrigDst);
unsigned Size = Ty.getSizeInBits();
MachineFunction &MF = B.getMF();
+ unsigned Opc = 0;
+ if (Size < 32 && ST.hasScalarSubwordLoads()) {
+ assert(Size == 8 || Size == 16);
+ Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
+ : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
+ // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
+ // destination register.
+ Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
+ } else {
+ Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
+ Dst = OrigDst;
+ }
Observer.changingInstr(MI);
@@ -6469,19 +6482,24 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// FIXME: We don't really need this intermediate instruction. The intrinsic
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
- MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
+ MI.setDesc(B.getTII().get(Opc));
MI.removeOperand(1); // Remove intrinsic ID
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
// TODO: Should this use datalayout alignment?
const unsigned MemSize = (Size + 7) / 8;
- const Align MemAlign(4);
+ const Align MemAlign(std::min(MemSize, 4u));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
+ if (Dst != OrigDst) {
+ MI.getOperand(0).setReg(Dst);
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ B.buildTrunc(OrigDst, Dst);
+ }
// If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index bb1d6cb72e8071..a1c34e92a57f35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -411,6 +411,12 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
return Width == 16;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
+ return Width == 8;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
+ return Width == 16;
}
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 391c2b9ec256ea..f96328b34935f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -449,8 +449,13 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+ const unsigned MemSize = 8 * MMO->getSize();
+
// Require 4-byte alignment.
- return MMO->getAlign() >= Align(4) &&
+ return (MMO->getAlign() >= Align(4) ||
+ (Subtarget.hasScalarSubwordLoads() &&
+ ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
+ (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
// Can't do a scalar atomic load.
!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address
@@ -1074,6 +1079,13 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
(MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
return false;
+ if (LoadSize == 32 &&
+ ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
+ (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
+ isScalarLoadLegal(MI) &&
+ Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
+ return false;
+
Register PtrReg = MI.getOperand(1).getReg();
ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
@@ -3073,7 +3085,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(B, MI, {3, 6});
return;
}
- case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
applyMappingSBufferLoad(B, OpdMapper);
return;
}
@@ -4396,7 +4412,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// initialized.
break;
}
- case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
// Lie and claim everything is legal, even though some need to be
// SGPRs. applyMapping will have to deal with it as a waterfall loop.
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..e3c4a699afe79f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -423,6 +423,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return GFX9Insts;
}
+ bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6ddc7e864fb23c..945634176299f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -855,7 +855,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
- MVT::v2i16, MVT::v2f16, MVT::i128},
+ MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN,
@@ -5720,7 +5720,7 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
if (isTypeLegal(LoadVT)) {
return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
@@ -5894,6 +5894,56 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case Intrinsic::amdgcn_s_buffer_load: {
+ // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
+ // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
+ // combiner tries to merge the s_buffer_load_u8 with a sext instruction
+ // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
+ // s_buffer_load_i8.
+ assert(Subtarget->hasScalarSubwordLoads() &&
+ "s_buffer_load_{u8, i8} are supported "
+ "in GFX12 (or newer) architectures.");
+ SDValue Op = SDValue(N, 0);
+ SDValue Rsrc = Op.getOperand(1);
+ SDValue Offset = Op.getOperand(2);
+ SDValue CachePolicy = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ Align Alignment =
+ DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), Alignment);
+ SDValue LoadVal;
+ if (!Offset->isDivergent()) {
+ SDValue Ops[] = {Rsrc, // source register
+ Offset, CachePolicy};
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
+ DAG.getVTList(MVT::i32), Ops, VT, MMO);
+ LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ } else {
+ SDValue Ops[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ CachePolicy, // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+ setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
+ LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+ }
+ Results.push_back(LoadVal);
+ return;
+ }
}
break;
}
@@ -7751,11 +7801,18 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
VT.getStoreSize(), Alignment);
if (!Offset->isDivergent()) {
- SDValue Ops[] = {
- Rsrc,
- Offset, // Offset
- CachePolicy
- };
+ SDValue Ops[] = {Rsrc, Offset, CachePolicy};
+
+ // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
+ // s_buffer_load_u16 instruction is emitted for both signed and unsigned
+ // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
+ // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
+ if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
+ DAG.getVTList(MVT::i32), Ops, VT, MMO);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ }
// Widen vec3 load to vec4.
if (VT.isVector() && VT.getVectorNumElements() == 3 &&
@@ -7776,6 +7833,21 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
// We have a divergent offset. Emit a MUBUF buffer load instead. We can
// assume that the buffer is unswizzled.
+ SDValue Ops[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ CachePolicy, // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+ if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+ setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
+ return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+ }
+
SmallVector<SDValue, 4> Loads;
unsigned NumLoads = 1;
MVT LoadVT = VT.getSimpleVT();
@@ -7789,16 +7861,6 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
}
SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
- SDValue Ops[] = {
- DAG.getEntryNode(), // Chain
- Rsrc, // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- {}, // voffset
- {}, // soffset
- {}, // offset
- CachePolicy, // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
- };
// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
@@ -8378,9 +8440,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M, DAG, Ops);
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+ if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
+ M->getMemOperand());
return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand(), DAG);
@@ -9770,18 +9832,17 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
}
// Handle 8 bit and 16 bit buffer loads
-SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
- EVT LoadVT, SDLoc DL,
- ArrayRef<SDValue> Ops,
- MemSDNode *M) const {
+SDValue
+SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
+ SDLoc DL, ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) const {
EVT IntVT = LoadVT.changeTypeToInteger();
unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
- SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
- Ops, IntVT,
- M->getMemOperand());
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
@@ -12062,17 +12123,42 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
return SDValue();
}
-SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
- DAGCombinerInfo &DCI)
- const {
+SDValue
+SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SDValue Src = N->getOperand(0);
auto *VTSign = cast<VTSDNode>(N->getOperand(1));
- if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
- VTSign->getVT() == MVT::i8) ||
- (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
- VTSign->getVT() == MVT::i16)) &&
- Src.hasOneUse()) {
+ // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
+ // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
+ if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
+ VTSign->getVT() == MVT::i8) ||
+ (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
+ VTSign->getVT() == MVT::i16))) {
+ assert(Subtarget->hasScalarSubwordLoads() &&
+ "s_buffer_load_{u8, i8} are supported "
+ "in GFX12 (or newer) architectures.");
+ EVT VT = Src.getValueType();
+ unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
+ ? AMDGPUISD::SBUFFER_LOAD_BYTE
+ : AMDGPUISD::SBUFFER_LOAD_SHORT;
+ SDLoc DL(N);
+ SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
+ SDValue Ops[] = {
+ Src.getOperand(0), // source register
+ Src.getOperand(1), // offset
+ Src.getOperand(2) // cachePolicy
+ };
+ auto *M = cast<MemSDNode>(Src);
+ SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
+ Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
+ SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLo...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/77633
More information about the llvm-commits mailing list