[clang] bd840a4 - [AMDGPU] Add target intrinsic for s_prefetch_data (#107133)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Sep 5 15:14:35 PDT 2024
Author: Stanislav Mekhanoshin
Date: 2024-09-05T15:14:31-07:00
New Revision: bd840a40042c2c67f56079493d0bcdbfc70325ba
URL: https://github.com/llvm/llvm-project/commit/bd840a40042c2c67f56079493d0bcdbfc70325ba
DIFF: https://github.com/llvm/llvm-project/commit/bd840a40042c2c67f56079493d0bcdbfc70325ba.diff
LOG: [AMDGPU] Add target intrinsic for s_prefetch_data (#107133)
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SMInstructions.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index ab29ef38f7792f..5060647d357641 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -448,6 +448,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 02d8726baa4210..da7a1a55da5313 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19608,6 +19608,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
}
+ case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
+ return emitBuiltinWithOneOverloadedType<2>(
+ *this, E, Intrinsic::amdgcn_s_prefetch_data);
default:
return nullptr;
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index d9ec258e644c9d..34ee44afe0f104 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -256,4 +256,28 @@ void test_s_ttracedata_imm()
__builtin_amdgcn_s_ttracedata_imm(1);
}
-
+// CHECK-LABEL: @test_s_prefetch_data(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5)
+// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8
+// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31)
+// CHECK-NEXT: ret void
+//
+void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len)
+{
+ __builtin_amdgcn_s_prefetch_data(fp, 0);
+ __builtin_amdgcn_s_prefetch_data(gp, len);
+ __builtin_amdgcn_s_prefetch_data(cp, 31);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index dc13a35c66f9ab..a5259ba9eec36e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2689,6 +2689,14 @@ def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_wave_id :
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
+def int_amdgcn_s_prefetch_data :
+ Intrinsic<[],
+ [llvm_anyptr_ty, // Pointer to constant/global memory
+ llvm_i32_ty], // Length to prefetch 0-31 (1-32 chunks, units of 128 bytes)
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]
+ >;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3fcb364fc2c536..9bebd418bb426e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5541,7 +5541,12 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
- MIB.addImm(MI.getOperand(OpIdx).getImm());
+ const MachineOperand &Op = MI.getOperand(OpIdx);
+ int64_t Imm;
+ if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
+ MIB.addImm(Imm);
+ else
+ MIB.addImm(Op.getImm());
}
void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 4737a322c255f4..a2e6842b760f65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3290,6 +3290,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 2);
return;
}
+ case Intrinsic::amdgcn_s_prefetch_data: {
+ Register PtrReg = MI.getOperand(1).getReg();
+ unsigned AS = MRI.getType(PtrReg).getAddressSpace();
+ if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
+ constrainOpWithReadfirstlane(B, MI, 1);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ } else
+ MI.eraseFromParent();
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5151,6 +5161,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
+ case Intrinsic::amdgcn_s_prefetch_data: {
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81b52935ddf397..accc3084217f2b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1430,6 +1430,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_s_prefetch_data: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
default:
return false;
}
@@ -9921,6 +9928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
+ case Intrinsic::amdgcn_s_prefetch_data: {
+ // For non-global address space preserve the chain and remove the call.
+ if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
+ return Op.getOperand(0);
+ return Op;
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 844f62abc26717..90e11df500bc9b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6231,7 +6231,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
SBase->setReg(SGPR);
}
MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
- if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
+ if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
SOff->setReg(SGPR);
}
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 9fc570bb85f24e..e7db4f49d9e549 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1152,6 +1152,28 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
defm : SMPrefetchPat<"INST", i32imm_zero>;
defm : SMPrefetchPat<"DATA", i32imm_one>;
+let SubtargetPredicate = isGFX12Plus in {
+ def : GCNPat <
+ (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), (i32 SReg_32:$len)),
+ (S_PREFETCH_DATA $sbase, $offset, $len, 0)
+ >;
+
+ def : GCNPat <
+ (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), (i32 SReg_32:$len)),
+ (S_PREFETCH_DATA $sbase, 0, $len, 0)
+ >;
+
+ def : GCNPat <
+ (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), imm:$len),
+ (S_PREFETCH_DATA $sbase, $offset, (i32 SGPR_NULL), (as_i8timm $len))
+ >;
+
+ def : GCNPat <
+ (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len),
+ (S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len))
+ >;
+} // End let SubtargetPredicate = isGFX12Plus
+
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll
new file mode 100644
index 00000000000000..54c39d78adb583
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
+; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_imm_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
+; GCN-LABEL: prefetch_data_sgpr_imm_base_sgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x200, s2, 0
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_imm_len(ptr addrspace(4) inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_base_imm_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 31)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_imm_base_imm_len(ptr addrspace(4) inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_imm_base_imm_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x200, null, 31
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 31)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
+; GCN-LABEL: prefetch_data_vgpr_base_sgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: v_readfirstlane_b32 s3, v1
+; GCN-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
+; SDAG-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; SDAG-NEXT: s_prefetch_data s[2:3], 0x200, s0, 0
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0
+; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
+; GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_vgpr_len(ptr addrspace(4) inreg %ptr, i32 %len) {
+; GCN-LABEL: prefetch_data_sgpr_base_vgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_global(ptr addrspace(1) inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_base_imm_len_global:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 31)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_flat(ptr inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_base_imm_len_flat:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 31)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_local(ptr addrspace(3) inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_base_imm_len_local:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p3(ptr addrspace(3) %ptr, i32 31)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_base_imm_len(ptr addrspace(4) %ptr) {
+; GCN-LABEL: prefetch_data_vgpr_base_imm_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
+declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 %len)
+declare void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 %len)
More information about the cfe-commits
mailing list