[llvm] [AMDGPU] CodeGen for SMEM instructions (PR #75579)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 15 01:24:31 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Mirko Brkušanin (mbrkusanin)
<details>
<summary>Changes</summary>
---
Patch is 808.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75579.diff
27 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+12)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+4-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+63-25)
- (modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+37-2)
- (modified) llvm/lib/Target/AMDGPU/SMInstructions.td (+10-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir (+161-86)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll (+1650)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll (+1506)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir (+414-350)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir (+36-23)
- (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll (+131-6)
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+4-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+3-9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+34-58)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (+273)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f32.ll (+26)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f64.ll (+39)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+2018)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+1329)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+647)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i64.ll (+127)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+1808)
- (modified) llvm/test/CodeGen/AMDGPU/merge-s-load.mir (+135-27)
- (modified) llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir (+70-2)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+1-3)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 920cf784858768..d1cafd283d198d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -822,6 +822,12 @@ def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint",
"Has single-use VGPR hint instructions"
>;
+def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
+ "HasRestrictedSOffset",
+ "true",
+ "Has restricted SOffset (immediate not supported)."
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1467,6 +1473,7 @@ def FeatureISAVersion12 : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
+ FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
@@ -1780,6 +1787,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;
+def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
+ AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>;
+def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
+ AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;
+
def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 489b4f5a8d86a5..f3a59109b48219 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -460,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
return false;
// If we have 96-bit memory operations, we shouldn't touch them. Note we may
- // end up widening these for a scalar load during RegBankSelect, since there
- // aren't 96-bit scalar loads.
+ // end up widening these for a scalar load during RegBankSelect, if we don't
+ // have 96-bit scalar loads.
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
return false;
@@ -6467,10 +6467,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
- // There are no 96-bit result scalar loads, but widening to 128-bit should
+ // If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
- if (!isPowerOf2_32(Size)) {
+ if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d0c1302c3f003c..80d67836fda8f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
if (DstBank == &AMDGPU::SGPRRegBank) {
// There are some special cases that we need to look at for 32 bit and 96
// bit SGPR loads otherwise we have nothing to do.
- if (LoadSize != 32 && LoadSize != 96)
+ if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a316d608bf573d..bb9c92fd43f162 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -198,6 +198,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool ScalarizeGlobal = false;
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
+ bool HasRestrictedSOffset = false;
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
@@ -1160,6 +1161,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
+ bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 708f212e204acf..b9fc2617e6eb5e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1438,11 +1438,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
- } else {
+ } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
// On GFX9 the offset is signed 21-bit in bytes (but must not be negative
// for S_BUFFER_* instructions).
if (!isInt<21>(AM.BaseOffs))
return false;
+ } else {
+ // On GFX12, all offsets are signed 24-bit in bytes.
+ if (!isInt<24>(AM.BaseOffs))
+ return false;
}
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
@@ -7497,7 +7501,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
};
// Widen vec3 load to vec4.
- if (VT.isVector() && VT.getVectorNumElements() == 3) {
+ if (VT.isVector() && VT.getVectorNumElements() == 3 &&
+ !Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
@@ -7913,6 +7918,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// On targets not supporting constant in soffset field, turn zero to
+// SGPR_NULL to avoid generating an extra s_mov with zero.
+static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
+ const GCNSubtarget *Subtarget) {
+ if (Subtarget->hasRestrictedSOffset())
+ if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) {
+ if (SOffsetConst->isZero()) {
+ return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
+ }
+ }
+ return SOffset;
+}
+
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -7921,13 +7939,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7954,13 +7973,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8116,12 +8136,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8140,12 +8161,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8157,21 +8179,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- Op.getOperand(5), // soffset
- Op.getOperand(6), // offset
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // voffset
+ SOffset, // soffset
+ Op.getOperand(6), // offset
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -8187,13 +8210,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // format
Op.getOperand(6), // cachepolicy, swizzled buffer
@@ -8213,13 +8237,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8432,6 +8457,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8439,7 +8465,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8454,6 +8480,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8461,7 +8488,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
Op.getOperand(5), // vindex
Offsets.first, // voffset
- Op.getOperand(7), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8893,13 +8920,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
@@ -8920,13 +8948,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -9000,13 +9029,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -9050,13 +9080,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -9404,8 +9435,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
return;
}
}
+
+ SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
+ ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
+ : DAG.getConstant(0, DL, MVT::i32);
+
Offsets[0] = CombinedOffset;
- Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[1] = SOffsetZero;
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
@@ -9663,7 +9699,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
@@ -9679,7 +9716,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->getScalarizeGlobalBehavior() && !O...
[truncated]
``````````
</details>
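For readers skimming the truncated diff, here are two minimal IR sketches (illustrative only; the function names are invented and these are not taken from the patch's test files) of inputs whose codegen this patch changes when compiled for a GFX12-class target.

First, a 96-bit scalar buffer load: with scalar dwordx3 loads available, the legalizer and lowerSBuffer no longer need to widen the `<3 x i32>` result to `<4 x i32>`.

```llvm
; Hypothetical input: a <3 x i32> s.buffer.load. On subtargets that report
; hasScalarDwordx3Loads(), this can now stay a 96-bit scalar load instead
; of being widened to 128 bits.
define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
  %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret <3 x i32> %val
}

declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32)
```

Second, a buffer load with a literal-zero soffset: on subtargets where hasRestrictedSOffset() is set, the new selectSOffset() helper emits SGPR_NULL for the zero instead of materializing it with an extra s_mov_b32.

```llvm
; Hypothetical input: a raw buffer load whose soffset operand is the
; constant 0, which selectSOffset() folds to SGPR_NULL on
; restricted-soffset subtargets.
define amdgpu_ps float @raw_buffer_load_soffset_zero(<4 x i32> inreg %rsrc) {
  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  ret float %val
}

declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32)
```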
https://github.com/llvm/llvm-project/pull/75579