[llvm] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 6 01:24:12 PST 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Mariusz Sikora (mariusz-sikora-at-amd)

<details>
<summary>Changes</summary>



---

Patch is 25.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74576.diff


8 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+21) 
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+22) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+12) 
- (modified) llvm/lib/Target/AMDGPU/SMInstructions.td (+34) 
- (added) llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll (+496) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79..f0b3ed7adc294 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       applyDefaultMapping(OpdMapper);
       constrainOpWithReadfirstlane(B, MI, 8); // M0
       return;
+    case Intrinsic::prefetch: {
+      if (!Subtarget.hasPrefetch()) {
+        MI.eraseFromParent();
+        return;
+      }
+      unsigned PtrBank =
+          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
+      if (PtrBank == AMDGPU::VGPRRegBankID) {
+        MI.eraseFromParent();
+        return;
+      }
+      // FIXME: There is currently no support for prefetch in global isel.
+      // There is no node equivalence and what's worse there is no MMO produced
+      // for a prefetch on global isel path.
+      // Prefetch does not affect execution so erase it for now.
+      MI.eraseFromParent();
+      return;
+    }
     default: {
       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
       break;
     }
+    case Intrinsic::prefetch:
+      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      break;
 
     default:
       return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6..21a9b8147034f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
 
+  bool hasPrefetch() const { return GFX12Insts; }
+
   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspective of an arbitrary workitem, this
   // is 4-byte aligned.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7e..93af38d877c5d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasMad64_32())
     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
 
+  if (Subtarget->hasPrefetch())
+    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                       MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
 }
 
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+  if (Op->isDivergent())
+    return SDValue();
+
+  switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+    break;
+  default:
+    return SDValue();
+  }
+
+  return Op;
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerSTACKSAVE(Op, DAG);
   case ISD::GET_ROUNDING:
     return lowerGET_ROUNDING(Op, DAG);
+  case ISD::PREFETCH:
+    return lowerPREFETCH(Op, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c9cc149218a99..5bc091d6e84de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -416,6 +416,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d691254..8e96d5f8abe15 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -483,6 +483,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
     Offset = OffsetOp ? OffsetOp->getImm() : 0;
     // Get appropriate operand, and compute width accordingly.
     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
+    if (DataOpIdx == -1)
+      return false;
     Width = getOpSize(LdSt, DataOpIdx);
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9362fe5d9678b..6d513fb0bdecc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3164,6 +3164,18 @@ def : GCNPat <
                         (as_i1timm $bound_ctrl))
 >;
 
+class SMPrefetchGetPcPat<string type, int cache_type> : GCNPat <
+    (prefetch (i64 imm:$offset), timm, timm, (i32 cache_type)),
+    (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) (S_ADD_U64_PSEUDO (S_GETPC_B64), $offset),
+                                                   (i32 20), (i32 SGPR_NULL), (i8 0))
+    // Offset 20 should roughly adjust getpc sequence length.
+  > {
+  let AddedComplexity = 9;
+}
+
+def : SMPrefetchGetPcPat<"INST", 0>;
+def : SMPrefetchGetPcPat<"DATA", 1>;
+
 //===----------------------------------------------------------------------===//
 // Fract Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index c18846483cf95..a77856caae7a6 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -814,6 +814,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
   }];
 }
 
+def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+                             (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+                             [{ return !N->getOperand(1)->isDivergent();}]> {
+  let GISelPredicateCode = [{
+    return isInstrUniform(MI);
+  }];
+}
+
 def SMRDImm         : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
 def SMRDImm32       : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
 def SMRDSgpr        : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
@@ -959,6 +967,32 @@ def : GCNPat <
 }
 } // let OtherPredicates = [HasShaderCyclesRegister]
 
+def SIMM24bitPtr : ImmLeaf <iPTR,
+  [{return isInt<24>(Imm);}]
+>;
+
+multiclass SMPrefetchPat<string type, int cache_type> {
+  def : GCNPat <
+    (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)),
+    (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+    (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+    (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+    (prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+    (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0))
+  > {
+    let AddedComplexity = 10;
+  }
+}
+
+defm : SMPrefetchPat<"INST", 0>;
+defm : SMPrefetchPat<"DATA", 1>;
+
 //===----------------------------------------------------------------------===//
 // GFX10.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
new file mode 100644
index 0000000000000..bca76770953b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-SDAG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-GISEL %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+
+; Scalar data prefetch
+
+define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x200, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data_pc_rel 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data_pc_rel 0x200, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr float, ptr addrspace(4) null, i32 128
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+; Check large offsets
+
+define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_max_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x7fffff, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_max_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_max_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], -0x800000, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_min_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_max_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data_pc_rel 0x7fffff, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_min_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data_pc_rel -0x800000, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_too_large_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x14, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+  ret void
+}
+
+; Check divergent address
+
+define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) {
+; GCN-LABEL: prefetch_data_vgpr:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+; Check LDS and Scratch, we cannot prefetch it
+
+define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) {
+; GCN-LABEL: prefetch_data_lds:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p3(ptr addrspace(3) %ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_scratch(ptr addrspace(5) inreg %ptr) {
+; GCN-LABEL: prefetch_data_scratch:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p5(ptr addrspace(5) %ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+; Check supported address spaces
+
+define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_flat:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_flat:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_flat:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_global:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_global:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_global:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+; I$ prefetch
+
+define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x80, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr_offset:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_prefetch_inst_pc_rel 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_pc_rel:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_pc_rel:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel_offset() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_offset:
+; GFX12-SDAG:       ; %bb.0:...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/74576


More information about the llvm-commits mailing list