[llvm] [AMDGPU] Legalize and select raw/struct_buffer_load with tfe (PR #93310)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 24 07:58:38 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Mirko BrkuĊĦanin (mbrkusanin)
<details>
<summary>Changes</summary>
---
Patch is 335.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93310.diff
14 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+11-6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+10)
- (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+9)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+29-15)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+10)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+5)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll (+1515)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll (+1577)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll (+820)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll (+820)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 152f495a452ba..231db188e65dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -250,6 +250,11 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_TFE, SIbuffer_load_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT_TFE, SIbuffer_load_ushort_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, SIbuffer_load_ubyte_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT_TFE, SIbuffer_load_short_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE_TFE, SIbuffer_load_byte_tfe>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_TFE, SIbuffer_load_format_tfe>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 980e58510ceb7..375643b7f5197 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5529,6 +5529,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
NODE_NAME_CASE(BUFFER_LOAD_BYTE)
NODE_NAME_CASE(BUFFER_LOAD_SHORT)
+ NODE_NAME_CASE(BUFFER_LOAD_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3814b56a4d56a..71c4334029b43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -582,6 +582,11 @@ enum NodeType : unsigned {
BUFFER_LOAD_USHORT,
BUFFER_LOAD_BYTE,
BUFFER_LOAD_SHORT,
+ BUFFER_LOAD_TFE,
+ BUFFER_LOAD_UBYTE_TFE,
+ BUFFER_LOAD_USHORT_TFE,
+ BUFFER_LOAD_BYTE_TFE,
+ BUFFER_LOAD_SHORT_TFE,
BUFFER_LOAD_FORMAT,
BUFFER_LOAD_FORMAT_TFE,
BUFFER_LOAD_FORMAT_D16,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a771b421e77a4..ee7fb20c23aa7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5870,17 +5870,18 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
: AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
}
} else {
- if (IsTFE)
- return false;
switch (MemTy.getSizeInBits()) {
case 8:
- Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+ Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
break;
case 16:
- Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+ Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
break;
default:
- Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
+ Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD;
break;
}
}
@@ -5892,7 +5893,11 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
- if (NumValueDWords == 1) {
+ if (MemTy.getSizeInBits() < 32) {
+ Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
+ B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
+ B.buildTrunc(Dst, ExtDst);
+ } else if (NumValueDWords == 1) {
B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
} else {
SmallVector<Register, 5> LoadElts;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dbb42a60f71fe..7ebd674757fbc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3041,6 +3041,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
@@ -4323,6 +4328,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 8eaa113ac1816..1fbebc038c189 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1434,6 +1434,15 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v2i32, "BUFFER_LOAD_DWORD_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v3i32, "BUFFER_LOAD_DWORDX2_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v4i32, "BUFFER_LOAD_DWORDX3_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v5i32, "BUFFER_LOAD_DWORDX4_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte_tfe, v2i32, "BUFFER_LOAD_SBYTE_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short_tfe, v2i32, "BUFFER_LOAD_SSHORT_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte_tfe, v2i32, "BUFFER_LOAD_UBYTE_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort_tfe, v2i32, "BUFFER_LOAD_USHORT_TFE">;
+
multiclass MUBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1d2a5fff23568..b0f6e78709b1a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5962,16 +5962,10 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
assert(M->getNumValues() == 2 || M->getNumValues() == 3);
bool IsTFE = M->getNumValues() == 3;
- unsigned Opc;
- if (IsFormat) {
- Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
- : AMDGPUISD::BUFFER_LOAD_FORMAT;
- } else {
- // TODO: Support non-format TFE loads.
- if (IsTFE)
- return SDValue();
- Opc = AMDGPUISD::BUFFER_LOAD;
- }
+ unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
+ : AMDGPUISD::BUFFER_LOAD_FORMAT)
+ : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
+ : AMDGPUISD::BUFFER_LOAD;
if (IsD16) {
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
@@ -5979,7 +5973,8 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
+ IsTFE);
if (isTypeLegal(LoadVT)) {
return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
@@ -10172,11 +10167,30 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
}
// Handle 8 bit and 16 bit buffer loads
-SDValue
-SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
- SDLoc DL, ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO) const {
+SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
+ EVT LoadVT, SDLoc DL,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ bool IsTFE) const {
EVT IntVT = LoadVT.changeTypeToInteger();
+
+ if (IsTFE) {
+ unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
+ ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
+ : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
+ SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
+ SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
+ SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
+ DAG.getVectorIdxConstant(1, DL));
+ SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
+ SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
+ return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
+ }
+
unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index fed73f48840fd..292b17da93583 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -275,7 +275,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO) const;
+ MachineMemOperand *MMO,
+ bool IsTFE = false) const;
// Handle 8 bit and 16 bit buffer stores
SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 0ed2f60ea66a7..fd119e0992e56 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -148,6 +148,16 @@ def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ubyte_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ushort_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_byte_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_short_tfe: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_TFE", SDTBufferLoad,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e7aeaa017306c..d1667955f83db 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3762,6 +3762,11 @@ def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_UBYTE_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SBYTE_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_USHORT_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SSHORT_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
new file mode 100644
index 0000000000000..8eb05bb9565f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -0,0 +1,1515 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX6
+; RUN: llc -global-isel -mcpu=hawaii -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX7
+; RUN: llc -global-isel -mcpu=fiji -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX8
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1100 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -global-isel -mcpu=gfx1200 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+
+define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
+ ; GFX67-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX67: bb.1 (%ir-block.0):
+ ; GFX67-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX67-NEXT: {{ $}}
+ ; GFX67-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX67-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX67-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX67-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX67-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX67-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX67-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX67-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX67-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX67-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX67-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX67-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX67-NEXT: [[BUFFER_LOAD_UBYTE_TFE_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX67-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub0
+ ; GFX67-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub1
+ ; GFX67-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX67-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+ ; GFX67-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+ ; GFX67-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX67-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE3]], %subreg.sub2_sub3
+ ; GFX67-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY8]], [[REG_SEQUENCE1]], [[REG_SEQUENCE4]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX67-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX67-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+ ; GFX67-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_3]], %subreg.sub0, [[S_MOV_B32_4]], %subreg.sub1
+ ; GFX67-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX67-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_1]], %subreg.sub0_sub1, [[REG_SEQUENCE5]], %subreg.sub2_sub3
+ ; GFX67-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY9]], [[REG_SEQUENCE2]], [[REG_SEQUENCE6]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX67-NEXT: S_ENDPGM 0
+ ;
+ ; GFX8-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_TFE_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub0
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub1
+ ; GFX8-NEXT: FLAT_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX910-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX910: bb.1 (%ir-block.0):
+ ; GFX910-NEXT: liveins: $sgpr0, $...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/93310
More information about the llvm-commits
mailing list