[llvm] ec8ede8 - [AMDGPU][CodeGen] Support raw format TFE buffer loads other than byte, short and d16 ones.
Ivan Kosarev via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 24 03:19:27 PST 2022
Author: Ivan Kosarev
Date: 2022-11-24T10:50:26Z
New Revision: ec8ede817776347d568580b4ec26b8403bcce2a9
URL: https://github.com/llvm/llvm-project/commit/ec8ede817776347d568580b4ec26b8403bcce2a9
DIFF: https://github.com/llvm/llvm-project/commit/ec8ede817776347d568580b4ec26b8403bcce2a9.diff
LOG: [AMDGPU][CodeGen] Support raw format TFE buffer loads other than byte, short and d16 ones.
Differential Revision: https://reviews.llvm.org/D138215
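For illustration, a minimal usage sketch in LLVM IR (it mirrors the struct-return TFE intrinsic declarations added in the tests below and is not part of the commit itself): requesting the TFE status word is done by giving the format-load intrinsic an aggregate return type whose last member is an i32 status dword.

  %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
  %data = extractvalue { <4 x i32>, i32 } %load, 0    ; loaded data
  %status = extractvalue { <4 x i32>, i32 } %load, 1  ; TFE status dword

With this change such calls select to the BUFFER_LOAD_FORMAT_*_TFE instructions, as the updated tests check.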
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/BUFInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index c2b084bc0779d..787d097c2ae32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -218,6 +218,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_TFE, SIbuffer_load_format_tfe>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT, SItbuffer_load>;
def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT_D16, SItbuffer_load_d16>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 842c872d409fc..9c039da76b417 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4429,6 +4429,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_LOAD_BYTE)
NODE_NAME_CASE(BUFFER_LOAD_SHORT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(SBUFFER_LOAD)
NODE_NAME_CASE(BUFFER_STORE)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 619f9aff46a15..ef8539f8ccc1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -514,6 +514,7 @@ enum NodeType : unsigned {
BUFFER_LOAD_BYTE,
BUFFER_LOAD_SHORT,
BUFFER_LOAD_FORMAT,
+ BUFFER_LOAD_FORMAT_TFE,
BUFFER_LOAD_FORMAT_D16,
SBUFFER_LOAD,
BUFFER_STORE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a6088cd908502..b5b13ef21a731 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4466,6 +4466,27 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
return true;
}
+static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
+ Register VIndex, Register VOffset, Register SOffset,
+ unsigned ImmOffset, unsigned Format,
+ unsigned AuxiliaryData, MachineMemOperand *MMO,
+ bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
+ auto MIB = B.buildInstr(Opc)
+ .addDef(LoadDstReg) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset); // offset(imm)
+
+ if (IsTyped)
+ MIB.addImm(Format);
+
+ MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+}
+
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
@@ -4477,18 +4498,27 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
- Register RSrc = MI.getOperand(2).getReg();
+
+ Register StatusDst;
+ int OpOffset = 0;
+ assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
+ bool IsTFE = MI.getNumExplicitDefs() == 2;
+ if (IsTFE) {
+ StatusDst = MI.getOperand(1).getReg();
+ ++OpOffset;
+ }
+
+ Register RSrc = MI.getOperand(2 + OpOffset).getReg();
// The typed intrinsics add an immediate after the registers.
const unsigned NumVIndexOps = IsTyped ? 8 : 7;
// The struct intrinsic variants add one additional operand over raw.
- const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
Register VIndex;
- int OpOffset = 0;
if (HasVIndex) {
- VIndex = MI.getOperand(3).getReg();
- OpOffset = 1;
+ VIndex = MI.getOperand(3 + OpOffset).getReg();
+ ++OpOffset;
} else {
VIndex = B.buildConstant(S32, 0).getReg(0);
}
@@ -4515,13 +4545,21 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
unsigned Opc;
+ // TODO: Support TFE for typed and narrow loads.
if (IsTyped) {
+ assert(!IsTFE);
Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
} else if (IsFormat) {
- Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
- AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
+ if (IsD16) {
+ assert(!IsTFE);
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
+ } else {
+ Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
+ }
} else {
+ assert(!IsTFE);
switch (MemTy.getSizeInBits()) {
case 8:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
@@ -4535,49 +4573,46 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
}
}
- Register LoadDstReg;
-
- bool IsExtLoad =
- (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
- LLT UnpackedTy = Ty.changeElementSize(32);
-
- if (IsExtLoad)
- LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
- else if (Unpacked && IsD16 && Ty.isVector())
- LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
- else
- LoadDstReg = Dst;
-
- auto MIB = B.buildInstr(Opc)
- .addDef(LoadDstReg) // vdata
- .addUse(RSrc) // rsrc
- .addUse(VIndex) // vindex
- .addUse(VOffset) // voffset
- .addUse(SOffset) // soffset
- .addImm(ImmOffset); // offset(imm)
-
- if (IsTyped)
- MIB.addImm(Format);
-
- MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
- .addImm(HasVIndex ? -1 : 0) // idxen(imm)
- .addMemOperand(MMO);
-
- if (LoadDstReg != Dst) {
- B.setInsertPt(B.getMBB(), ++B.getInsertPt());
-
- // Widen result for extending loads was widened.
- if (IsExtLoad)
- B.buildTrunc(Dst, LoadDstReg);
- else {
- // Repack to original 16-bit vector result
- // FIXME: G_TRUNC should work, but legalization currently fails
- auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
- SmallVector<Register, 4> Repack;
- for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
- Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
- B.buildMerge(Dst, Repack);
+ if (IsTFE) {
+ unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
+ unsigned NumLoadDWords = NumValueDWords + 1;
+ LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
+ Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
+ buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
+ Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
+ if (NumValueDWords == 1) {
+ B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
+ } else {
+ SmallVector<Register, 5> LoadElts;
+ for (unsigned I = 0; I != NumValueDWords; ++I)
+ LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
+ LoadElts.push_back(StatusDst);
+ B.buildUnmerge(LoadElts, LoadDstReg);
+ LoadElts.truncate(NumValueDWords);
+ B.buildMerge(Dst, LoadElts);
}
+ } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
+ (IsD16 && !Ty.isVector())) {
+ Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
+ buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
+ Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ B.buildTrunc(Dst, LoadDstReg);
+ } else if (Unpacked && IsD16 && Ty.isVector()) {
+ LLT UnpackedTy = Ty.changeElementSize(32);
+ Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
+ buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
+ Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ // FIXME: G_TRUNC should work, but legalization currently fails
+ auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
+ SmallVector<Register, 4> Repack;
+ for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
+ Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
+ B.buildMerge(Dst, Repack);
+ } else {
+ buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
+ AuxiliaryData, MMO, IsTyped, HasVIndex, B);
}
MI.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 3e0d60100cc1d..c5372559a5ded 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2877,6 +2877,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
@@ -4046,6 +4047,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 9056359f11730..5a41cc6043258 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1299,6 +1299,11 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3i32, "BUFFER_LOAD_FORMAT_X
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v2i32, "BUFFER_LOAD_FORMAT_X_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v3i32, "BUFFER_LOAD_FORMAT_XY_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v4i32, "BUFFER_LOAD_FORMAT_XYZ_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v5i32, "BUFFER_LOAD_FORMAT_XYZW_TFE">;
+
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c40a1b7065322..510d99dee014e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -919,11 +919,11 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
-static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
- assert(DMaskLanes != 0);
+static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+ assert(MaxNumLanes != 0);
if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
- unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
+ unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
return EVT::getVectorVT(Ty->getContext(),
EVT::getEVT(VT->getElementType()),
NumElts);
@@ -933,19 +933,15 @@ static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
}
// Peek through TFE struct returns to only use the data size.
-static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
+static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
auto *ST = dyn_cast<StructType>(Ty);
if (!ST)
- return memVTFromImageData(Ty, DMaskLanes);
+ return memVTFromLoadIntrData(Ty, MaxNumLanes);
- // Some intrinsics return an aggregate type - special case to work out the
- // correct memVT.
- //
- // Only limited forms of aggregate type currently expected.
- if (ST->getNumContainedTypes() != 2 ||
- !ST->getContainedType(1)->isIntegerTy(32))
- return EVT();
- return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
+ // TFE intrinsics return an aggregate type.
+ assert(ST->getNumContainedTypes() == 2 &&
+ ST->getContainedType(1)->isIntegerTy(32));
+ return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
}
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
@@ -978,7 +974,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
- unsigned DMaskLanes = 4;
+ unsigned MaxNumLanes = 4;
if (RsrcIntr->IsImage) {
const AMDGPU::ImageDimIntrinsicInfo *Intr
@@ -991,12 +987,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// IR type. Check the dmask for the real number of elements loaded.
unsigned DMask
= cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
- DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
+ MaxNumLanes = DMask == 0 ? 1 : countPopulation(DMask);
}
+ }
- Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
- } else
- Info.memVT = EVT::getEVT(CI.getType());
+ Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
// FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1008,7 +1003,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (RsrcIntr->IsImage) {
unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
- Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
+ Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
} else
Info.memVT = EVT::getEVT(DataTy);
@@ -4854,8 +4849,18 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
- unsigned Opc =
- IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
+ assert(M->getNumValues() == 2 || M->getNumValues() == 3);
+ bool IsTFE = M->getNumValues() == 3;
+
+ unsigned Opc;
+ if (IsFormat) {
+ Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
+ : AMDGPUISD::BUFFER_LOAD_FORMAT;
+ } else {
+ // TODO: Support non-format TFE loads.
+ assert(!IsTFE);
+ Opc = AMDGPUISD::BUFFER_LOAD;
+ }
if (IsD16) {
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
@@ -7850,35 +7855,54 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
-// dwordx4 if on SI.
+// dwordx4 if on SI and handle TFE loads.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO,
SelectionDAG &DAG) const {
+ LLVMContext &C = *DAG.getContext();
+ MachineFunction &MF = DAG.getMachineFunction();
EVT VT = VTList.VTs[0];
- EVT WidenedVT = VT;
- EVT WidenedMemVT = MemVT;
- if (!Subtarget->hasDwordx3LoadStores() &&
- (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
- WidenedVT = EVT::getVectorVT(*DAG.getContext(),
- WidenedVT.getVectorElementType(), 4);
- WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
- WidenedMemVT.getVectorElementType(), 4);
- MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
- }
- assert(VTList.NumVTs == 2);
- SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
-
- auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
- WidenedMemVT, MMO);
- if (WidenedVT != VT) {
- auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
- DAG.getVectorIdxConstant(0, DL));
- NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
+ assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
+ bool IsTFE = VTList.NumVTs == 3;
+ if (IsTFE) {
+ unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
+ unsigned NumOpDWords = NumValueDWords + 1;
+ EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
+ SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
+ MachineMemOperand *OpDWordsMMO =
+ MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
+ SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
+ OpDWordsVT, OpDWordsMMO, DAG);
+ SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
+ DAG.getVectorIdxConstant(NumValueDWords, DL));
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+ SDValue ValueDWords =
+ NumValueDWords == 1
+ ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
+ : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
+ EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
+ ZeroIdx);
+ SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
+ return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
}
- return NewOp;
+
+ if (!Subtarget->hasDwordx3LoadStores() &&
+ (VT == MVT::v3i32 || VT == MVT::v3f32)) {
+ EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
+ EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
+ MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
+ SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
+ SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
+ WidenedMemVT, WidenedMMO);
+ SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
+ DAG.getVectorIdxConstant(0, DL));
+ return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
+ }
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
}
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index dfbf82038ec15..950fa70fa43fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -135,6 +135,8 @@ def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1e25c359981c..0516547e179c3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3233,6 +3233,7 @@ def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
index 6b7ad3fbb4178..c13ff4c2cb2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
@@ -214,10 +214,125 @@ define amdgpu_ps float @struct_buffer_load_format_i32__sgpr_rsrc__vgpr_vindex__v
ret float %fval
}
+define amdgpu_cs void @struct_buffer_load_format_v4i32_tfe(<4 x i32> inreg %rsrc, <4 x i32> addrspace(1)* %value, i32 addrspace(1)* %status) {
+ ; CHECK-LABEL: name: struct_buffer_load_format_v4i32_tfe
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from custom "BufferResource", align 1, addrspace 4)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub0
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub2
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub3
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub4
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; CHECK-NEXT: FLAT_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into %ir.value, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+
+ %v = extractvalue { <4 x i32>, i32 } %load, 0
+ store <4 x i32> %v, <4 x i32> addrspace(1)* %value
+
+ %s = extractvalue { <4 x i32>, i32 } %load, 1
+ store i32 %s, i32 addrspace(1)* %status
+
+ ret void
+}
+
+define amdgpu_cs void @struct_buffer_load_format_v3i32_tfe(<4 x i32> inreg %rsrc, <3 x i32> addrspace(1)* %value, i32 addrspace(1)* %status) {
+ ; CHECK-LABEL: name: struct_buffer_load_format_v3i32_tfe
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from custom "BufferResource", align 1, addrspace 4)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub0
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub2
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; CHECK-NEXT: FLAT_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>) into %ir.value, align 16, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+
+ %v = extractvalue { <3 x i32>, i32 } %load, 0
+ store <3 x i32> %v, <3 x i32> addrspace(1)* %value
+
+ %s = extractvalue { <3 x i32>, i32 } %load, 1
+ store i32 %s, i32 addrspace(1)* %status
+
+ ret void
+}
+
+define amdgpu_cs void @struct_buffer_load_format_i32_tfe(<4 x i32> inreg %rsrc, i32 addrspace(1)* %value, i32 addrspace(1)* %status) {
+ ; CHECK-LABEL: name: struct_buffer_load_format_i32_tfe
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub0
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub1
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1)
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+
+ %v = extractvalue { i32, i32 } %load, 0
+ store i32 %v, i32 addrspace(1)* %value
+
+ %s = extractvalue { i32, i32 } %load, 1
+ store i32 %s, i32 addrspace(1)* %status
+
+ ret void
+}
+
declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #0
declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32 immarg) #0
declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32 immarg) #0
declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #0
declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
index e0a9423f5d5f2..4e89591d5f2e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
@@ -1,5 +1,6 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=CHECK,GFX6 %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=CHECK,GFX8PLUS %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=CHECK,GFX8PLUS %s
;CHECK-LABEL: {{^}}buffer_load:
;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
@@ -118,9 +119,115 @@ main_body:
ret <2 x float> %data
}
+;CHECK-LABEL: {{^}}buffer_load_v4i32_tfe:
+;CHECK: buffer_load_format_xyzw v[2:6], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, <4 x i32> addrspace(1)* %out) {
+ %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { <4 x i32>, i32 } %load, 0
+ store <4 x i32> %data, <4 x i32> addrspace(1)* %out
+ %status = extractvalue { <4 x i32>, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
+;CHECK-LABEL: {{^}}buffer_load_v4f32_tfe:
+;CHECK: buffer_load_format_xyzw v[2:6], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, <4 x float> addrspace(1)* %out) {
+ %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { <4 x float>, i32 } %load, 0
+ store <4 x float> %data, <4 x float> addrspace(1)* %out
+ %status = extractvalue { <4 x float>, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
+;CHECK-LABEL: {{^}}buffer_load_v3i32_tfe:
+;CHECK: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, <3 x i32> addrspace(1)* %out) {
+ %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { <3 x i32>, i32 } %load, 0
+ store <3 x i32> %data, <3 x i32> addrspace(1)* %out
+ %status = extractvalue { <3 x i32>, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
+;CHECK-LABEL: {{^}}buffer_load_v3f32_tfe:
+;CHECK: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, <3 x float> addrspace(1)* %out) {
+ %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { <3 x float>, i32 } %load, 0
+ store <3 x float> %data, <3 x float> addrspace(1)* %out
+ %status = extractvalue { <3 x float>, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
+;CHECK-LABEL: {{^}}buffer_load_v2i32_tfe:
+;GFX6: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;GFX8PLUS: buffer_load_format_xy v[2:4], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, <2 x i32> addrspace(1)* %out) {
+ %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { <2 x i32>, i32 } %load, 0
+ store <2 x i32> %data, <2 x i32> addrspace(1)* %out
+ %status = extractvalue { <2 x i32>, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
+;CHECK-LABEL: {{^}}buffer_load_v2f32_tfe:
+;GFX6: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;GFX8PLUS: buffer_load_format_xy v[2:4], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, <2 x float> addrspace(1)* %out) {
+ %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { <2 x float>, i32 } %load, 0
+ store <2 x float> %data, <2 x float> addrspace(1)* %out
+ %status = extractvalue { <2 x float>, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
+;CHECK-LABEL: {{^}}buffer_load_i32_tfe:
+;CHECK: buffer_load_format_x v[2:3], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, i32 addrspace(1)* %out) {
+ %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { i32, i32 } %load, 0
+ store i32 %data, i32 addrspace(1)* %out
+ %status = extractvalue { i32, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
+;CHECK-LABEL: {{^}}buffer_load_f32_tfe:
+;CHECK: buffer_load_format_x v[2:3], {{v[0-9]+}}, s[0:3], 0 idxen tfe
+;CHECK: s_waitcnt
+define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, float addrspace(1)* %out) {
+ %load = call { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %data = extractvalue { float, i32 } %load, 0
+ store float %data, float addrspace(1)* %out
+ %status = extractvalue { float, i32 } %load, 1
+ %fstatus = bitcast i32 %status to float
+ ret float %fstatus
+}
+
declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #0
declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32) #0
+declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
+declare { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0
attributes #0 = { nounwind readonly }