[libcxx-commits] [flang] [clang] [lldb] [libcxxabi] [lld] [compiler-rt] [clang-tools-extra] [llvm] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
Mariusz Sikora via libcxx-commits
libcxx-commits at lists.llvm.org
Fri Dec 15 04:53:13 PST 2023
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/74576
From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 28 Jun 2022 15:24:24 -0700
Subject: [PATCH 1/4] [AMDGPU] GFX12: select @llvm.prefetch intrinsic
---
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 21 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 22 +
llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 +
llvm/lib/Target/AMDGPU/SMInstructions.td | 34 ++
llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++++++++++++++++++
8 files changed, 591 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79f..f0b3ed7adc294c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 8); // M0
return;
+ case Intrinsic::prefetch: {
+ if (!Subtarget.hasPrefetch()) {
+ MI.eraseFromParent();
+ return;
+ }
+ unsigned PtrBank =
+ getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ if (PtrBank == AMDGPU::VGPRRegBankID) {
+ MI.eraseFromParent();
+ return;
+ }
+ // FIXME: There is currently no support for prefetch in global isel.
+ // There is no node equivalence and what's worse there is no MMO produced
+ // for a prefetch on global isel path.
+ // Prefetch does not affect execution so erase it for now.
+ MI.eraseFromParent();
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
break;
}
+ case Intrinsic::prefetch:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ break;
default:
return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6f..21a9b8147034fc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
+ bool hasPrefetch() const { return GFX12Insts; }
+
// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..93af38d877c5d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
+ if (Subtarget->hasPrefetch())
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
}
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+ if (Op->isDivergent())
+ return SDValue();
+
+ switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+ case AMDGPUAS::FLAT_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ break;
+ default:
+ return SDValue();
+ }
+
+ return Op;
+}
+
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
@@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTACKSAVE(Op, DAG);
case ISD::GET_ROUNDING:
return lowerGET_ROUNDING(Op, DAG);
+ case ISD::PREFETCH:
+ return lowerPREFETCH(Op, DAG);
}
return SDValue();
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c9cc149218a997..5bc091d6e84de3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -416,6 +416,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d6912544..8e96d5f8abe151 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -483,6 +483,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
Offset = OffsetOp ? OffsetOp->getImm() : 0;
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
+ if (DataOpIdx == -1)
+ return false;
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9362fe5d9678b4..6d513fb0bdecc9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3164,6 +3164,18 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;
+class SMPrefetchGetPcPat<string type, int cache_type> : GCNPat <
+ (prefetch (i64 imm:$offset), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) (S_ADD_U64_PSEUDO (S_GETPC_B64), $offset),
+ (i32 20), (i32 SGPR_NULL), (i8 0))
+ // Offset 20 should roughly adjust getpc sequence length.
+ > {
+ let AddedComplexity = 9;
+}
+
+def : SMPrefetchGetPcPat<"INST", 0>;
+def : SMPrefetchGetPcPat<"DATA", 1>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index c18846483cf95a..a77856caae7a65 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -814,6 +814,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
}];
}
+def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return !N->getOperand(1)->isDivergent();}]> {
+ let GISelPredicateCode = [{
+ return isInstrUniform(MI);
+ }];
+}
+
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
@@ -959,6 +967,32 @@ def : GCNPat <
}
} // let OtherPredicates = [HasShaderCyclesRegister]
+def SIMM24bitPtr : ImmLeaf <iPTR,
+ [{return isInt<24>(Imm);}]
+>;
+
+multiclass SMPrefetchPat<string type, int cache_type> {
+ def : GCNPat <
+ (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))
+ >;
+
+ def : GCNPat <
+ (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0))
+ >;
+
+ def : GCNPat <
+ (prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0))
+ > {
+ let AddedComplexity = 10;
+ }
+}
+
+defm : SMPrefetchPat<"INST", 0>;
+defm : SMPrefetchPat<"DATA", 1>;
+
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
new file mode 100644
index 00000000000000..bca76770953b97
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-SDAG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-GISEL %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+
+; Scalar data prefetch
+
+define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x200, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x200, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr addrspace(4) null, i32 128
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check large offsets
+
+define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_max_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_max_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], -0x800000, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_min_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_max_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x7fffff, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_min_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel -0x800000, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_too_large_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x14, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check divergent address
+
+define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) {
+; GCN-LABEL: prefetch_data_vgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check LDS and Scratch, we cannot prefetch it
+
+define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) {
+; GCN-LABEL: prefetch_data_lds:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p3(ptr addrspace(3) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_scratch(ptr addrspace(5) inreg %ptr) {
+; GCN-LABEL: prefetch_data_scratch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p5(ptr addrspace(5) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check supported address spaces
+
+define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_flat:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_flat:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_flat:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_global:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_global:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_global:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; I$ prefetch
+
+define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_pc_rel:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_pc_rel:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel_offset() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x80, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_pc_rel_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 128
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; Check large offsets
+
+define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr_max_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr_max_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], -0x800000, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel_max_offset() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_max_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x7fffff, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_pc_rel_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_max_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel_min_offset() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_min_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel -0x800000, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_pc_rel_min_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_min_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel_too_large_offset() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_too_large_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x14, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_pc_rel_too_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_too_large_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.prefetch.pf(ptr nocapture readonly, i32, i32, i32)
+declare void @llvm.prefetch.p1(ptr addrspace(1) nocapture readonly, i32, i32, i32)
+declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i32)
+declare void @llvm.prefetch.p4(ptr addrspace(4) nocapture readonly, i32, i32, i32)
+declare void @llvm.prefetch.p5(ptr addrspace(5) nocapture readonly, i32, i32, i32)
+declare void @llvm.prefetch.p6(ptr addrspace(6) nocapture readonly, i32, i32, i32)
From 3d56730e6607ee9e463188b0b936ef703ad85feb Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Thu, 7 Dec 2023 14:18:59 +0100
Subject: [PATCH 2/4] Stop generating _PC_REL form
---
llvm/lib/Target/AMDGPU/SMInstructions.td | 11 --
llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 150 ----------------------
2 files changed, 161 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index a77856caae7a65..fb47ee62930fae 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -967,10 +967,6 @@ def : GCNPat <
}
} // let OtherPredicates = [HasShaderCyclesRegister]
-def SIMM24bitPtr : ImmLeaf <iPTR,
- [{return isInt<24>(Imm);}]
->;
-
multiclass SMPrefetchPat<string type, int cache_type> {
def : GCNPat <
(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)),
@@ -981,13 +977,6 @@ multiclass SMPrefetchPat<string type, int cache_type> {
(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
(!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0))
>;
-
- def : GCNPat <
- (prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
- (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0))
- > {
- let AddedComplexity = 10;
- }
}
defm : SMPrefetchPat<"INST", 0>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index bca76770953b97..d5dcfef91923e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -43,43 +43,6 @@ entry:
ret void
}
-define amdgpu_ps void @prefetch_data_pc_rel() {
-; GFX12-SDAG-LABEL: prefetch_data_pc_rel:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x0, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_data_pc_rel:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_data_pc_rel:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 1)
- ret void
-}
-
-define amdgpu_ps void @prefetch_data_pc_rel_offset() {
-; GFX12-SDAG-LABEL: prefetch_data_pc_rel_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x200, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_data_pc_rel_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_data_pc_rel_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr float, ptr addrspace(4) null, i32 128
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
- ret void
-}
-
; Check large offsets
define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
@@ -141,44 +104,6 @@ entry:
ret void
}
-define amdgpu_ps void @prefetch_data_pc_rel_max_offset() {
-; GFX12-SDAG-LABEL: prefetch_data_pc_rel_max_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x7fffff, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_data_pc_rel_max_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_data_pc_rel_max_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
- ret void
-}
-
-define amdgpu_ps void @prefetch_data_pc_rel_min_offset() {
-; GFX12-SDAG-LABEL: prefetch_data_pc_rel_min_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel -0x800000, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_data_pc_rel_min_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_data_pc_rel_min_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
- ret void
-}
-
define amdgpu_ps void @prefetch_data_pc_rel_too_large_offset() {
; GFX12-SDAG-LABEL: prefetch_data_pc_rel_too_large_offset:
; GFX12-SDAG: ; %bb.0: ; %entry
@@ -329,43 +254,6 @@ entry:
ret void
}
-define amdgpu_ps void @prefetch_inst_pc_rel() {
-; GFX12-SDAG-LABEL: prefetch_inst_pc_rel:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x0, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_inst_pc_rel:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_inst_pc_rel:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 0)
- ret void
-}
-
-define amdgpu_ps void @prefetch_inst_pc_rel_offset() {
-; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x80, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_inst_pc_rel_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr i8, ptr addrspace(4) null, i32 128
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
- ret void
-}
-
; Check large offsets
define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
@@ -427,44 +315,6 @@ entry:
ret void
}
-define amdgpu_ps void @prefetch_inst_pc_rel_max_offset() {
-; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_max_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x7fffff, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_inst_pc_rel_max_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_max_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
- ret void
-}
-
-define amdgpu_ps void @prefetch_inst_pc_rel_min_offset() {
-; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_min_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel -0x800000, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_inst_pc_rel_min_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_min_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
- ret void
-}
-
define amdgpu_ps void @prefetch_inst_pc_rel_too_large_offset() {
; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_too_large_offset:
; GFX12-SDAG: ; %bb.0: ; %entry
>From a4366be2e0fe2fa1e7740cbc29a9f372841b4abf Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Fri, 15 Dec 2023 12:41:04 +0100
Subject: [PATCH 3/4] Remove PrefetchGetPcPat
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 ------
llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 46 -----------------------
2 files changed, 58 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6d513fb0bdecc9..9362fe5d9678b4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3164,18 +3164,6 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;
-class SMPrefetchGetPcPat<string type, int cache_type> : GCNPat <
- (prefetch (i64 imm:$offset), timm, timm, (i32 cache_type)),
- (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) (S_ADD_U64_PSEUDO (S_GETPC_B64), $offset),
- (i32 20), (i32 SGPR_NULL), (i8 0))
- // Offset 20 should roughly adjust getpc sequence length.
- > {
- let AddedComplexity = 9;
-}
-
-def : SMPrefetchGetPcPat<"INST", 0>;
-def : SMPrefetchGetPcPat<"DATA", 1>;
-
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index d5dcfef91923e2..0df838bc5c8101 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -104,29 +104,6 @@ entry:
ret void
}
-define amdgpu_ps void @prefetch_data_pc_rel_too_large_offset() {
-; GFX12-SDAG-LABEL: prefetch_data_pc_rel_too_large_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1]
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
-; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
-; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x14, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_data_pc_rel_too_large_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_data_pc_rel_too_large_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
- ret void
-}
-
; Check divergent address
define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) {
@@ -315,29 +292,6 @@ entry:
ret void
}
-define amdgpu_ps void @prefetch_inst_pc_rel_too_large_offset() {
-; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_too_large_offset:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1]
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
-; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
-; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x14, null, 0
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX11-LABEL: prefetch_inst_pc_rel_too_large_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_too_large_offset:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608
- tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
- ret void
-}
-
declare void @llvm.prefetch.pf(ptr nocapture readonly, i32, i32, i32)
declare void @llvm.prefetch.p1(ptr addrspace(1) nocapture readonly, i32, i32, i32)
declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i32)
>From 0598ae2c678b6d23fbd2a1acb8256e2d50db6f68 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Fri, 15 Dec 2023 13:37:39 +0100
Subject: [PATCH 4/4] G_PREFETCH part
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 ++
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 21 +++++++++++++++++++
llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 6 ++----
3 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8de8d8f68ad45b..1cfe5ce31dbaff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1987,6 +1987,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
.unsupported();
+ getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b47fafb273442d..d4824f964d4b82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3258,6 +3258,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_MAD_I64_I32:
applyMappingMAD_64_32(B, OpdMapper);
return;
+ case AMDGPU::G_PREFETCH: {
+ if (!Subtarget.hasPrefetch()) {
+ MI.eraseFromParent();
+ return;
+ }
+ unsigned PtrBank =
+ getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ if (PtrBank == AMDGPU::VGPRRegBankID) {
+ MI.eraseFromParent();
+ return;
+ }
+ // FIXME: There is currently no support for prefetch in global isel.
+ // There is no node equivalence and what's worse there is no MMO produced
+ // for a prefetch on global isel path.
+ // Prefetch does not affect execution so erase it for now.
+ MI.eraseFromParent();
+ return;
+ }
default:
break;
}
@@ -5012,6 +5030,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_PREFETCH:
+ OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ break;
}
return getInstructionMapping(/*ID*/1, /*Cost*/1,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index 0df838bc5c8101..c287789f8f4938 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -86,8 +86,7 @@ entry:
define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
; GFX12-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
-; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000
; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -274,8 +273,7 @@ entry:
define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
; GFX12-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
-; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000
; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
; GFX12-SDAG-NEXT: s_endpgm
;
More information about the libcxx-commits
mailing list