[llvm] e9de91e - [AMDGPU] Add safe-smem-prefetch SubtargetFeature off by default (#130050)

via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 7 02:10:24 PST 2025


Author: Mariusz Sikora
Date: 2025-03-07T11:10:21+01:00
New Revision: e9de91e989de2b61ded7f471b48453eddf77ca29

URL: https://github.com/llvm/llvm-project/commit/e9de91e989de2b61ded7f471b48453eddf77ca29
DIFF: https://github.com/llvm/llvm-project/commit/e9de91e989de2b61ded7f471b48453eddf77ca29.diff

LOG: [AMDGPU] Add safe-smem-prefetch SubtargetFeature off by default (#130050)

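S_PREFETCH_* instructions may cause the host to terminate the process in
case of an invalid address. Therefore, only lower prefetches to S_PREFETCH_*
when the subtarget has the safe-smem-prefetch feature, which is off by
default; otherwise the prefetch is dropped.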

Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
    llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 22b519898f6bd..1c8dc09d3060b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -244,6 +244,12 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
   "S_INST_PREFETCH instruction causes shader to hang"
 >;
 
+def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
+  "HasSafeSmemPrefetch",
+  "true",
+  "SMEM prefetches do not fail on illegal address"
+>;
+
 def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
   "HasVcmpxExecWARHazard",
   "true",

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index efe92e0fecc12..c19ee14ab1574 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3460,7 +3460,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     applyMappingMAD_64_32(B, OpdMapper);
     return;
   case AMDGPU::G_PREFETCH: {
-    if (!Subtarget.hasPrefetch()) {
+    if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
       MI.eraseFromParent();
       return;
     }

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e6f02a4eeaac8..f7f03fe5911bd 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -233,6 +233,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasVMEMtoScalarWriteHazard = false;
   bool HasSMEMtoVectorWriteHazard = false;
   bool HasInstFwdPrefetchBug = false;
+  bool HasSafeSmemPrefetch = false;
   bool HasVcmpxExecWARHazard = false;
   bool HasLdsBranchVmemWARHazard = false;
   bool HasNSAtoVMEMBug = false;
@@ -963,6 +964,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasPrefetch() const { return GFX12Insts; }
 
+  bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+
   // Has s_cmpk_* instructions.
   bool hasSCmpK() const { return getGeneration() < GFX12; }
 

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f930b5eac6953..0d100aaf4412d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -855,7 +855,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasMad64_32())
     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
 
-  if (Subtarget->hasPrefetch())
+  if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
     setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   if (Subtarget->hasIEEEMinMax()) {

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index 2b517736ecff3..72260e0b99715 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -1,34 +1,36 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12,GFX12-GISEL %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
 
 ; Scalar data prefetch
 
 define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_data_sgpr:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_data_sgpr:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
   ret void
 }
 
 define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_data_sgpr_offset:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_data s[0:1], 0x200, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_data_sgpr_offset:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x200, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -38,14 +40,14 @@ entry:
 ; Check large offsets
 
 define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_data_sgpr_max_offset:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_data s[0:1], 0x7fffff, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr_max_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x7fffff, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -53,25 +55,25 @@ entry:
 }
 
 define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_mov_b32 s2, 0xff800000
-; GFX12-SDAG-NEXT:    s_mov_b32 s3, -1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-SDAG-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_min_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr_min_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_min_offset:
+; SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; SPREFETCH-SDAG-NEXT:    s_mov_b32 s2, 0xff800000
+; SPREFETCH-SDAG-NEXT:    s_mov_b32 s3, -1
+; SPREFETCH-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SPREFETCH-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
-; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-GISEL-NEXT:    s_endpgm
+; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset:
+; SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
+; SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; SPREFETCH-GISEL-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -79,22 +81,22 @@ entry:
 }
 
 define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x800000
-; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-SDAG-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_too_large_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr_too_large_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
+; SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x800000
+; SPREFETCH-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800000
-; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
-; GFX12-GISEL-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-GISEL-NEXT:    s_endpgm
+; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
+; SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800000
+; SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
+; SPREFETCH-GISEL-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -135,43 +137,43 @@ entry:
 ; Check supported address spaces
 
 define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) {
-; GFX12-LABEL: prefetch_data_sgpr_flat:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr_flat:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_data_sgpr_flat:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1)
   ret void
 }
 
 define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) {
-; GFX12-LABEL: prefetch_data_sgpr_global:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_global:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr_global:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_data_sgpr_global:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
   ret void
 }
 
 define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) {
-; GFX12-LABEL: prefetch_data_sgpr_constant_32bit:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_data_sgpr_constant_32bit:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_mov_b32 s1, 0
+; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1)
   ret void
@@ -180,28 +182,28 @@ entry:
 ; I$ prefetch
 
 define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_inst_sgpr:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_inst_sgpr:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_inst_sgpr:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0)
   ret void
 }
 
 define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_inst_sgpr_offset:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_inst s[0:1], 0x80, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_inst_sgpr_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_inst_sgpr_offset:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x80, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -211,14 +213,14 @@ entry:
 ; Check large offsets
 
 define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_inst_sgpr_max_offset:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_inst s[0:1], 0x7fffff, null, 0
-; GFX12-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_inst_sgpr_max_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
+; SPREFETCH:       ; %bb.0: ; %entry
+; SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x7fffff, null, 0
+; SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -226,25 +228,25 @@ entry:
 }
 
 define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-SDAG-LABEL: prefetch_inst_sgpr_min_offset:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_mov_b32 s2, 0xff800000
-; GFX12-SDAG-NEXT:    s_mov_b32 s3, -1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
-; GFX12-SDAG-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_min_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_inst_sgpr_min_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_min_offset:
+; SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; SPREFETCH-SDAG-NEXT:    s_mov_b32 s2, 0xff800000
+; SPREFETCH-SDAG-NEXT:    s_mov_b32 s3, -1
+; SPREFETCH-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SPREFETCH-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; SPREFETCH-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
-; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
-; GFX12-GISEL-NEXT:    s_endpgm
+; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
+; SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
+; SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; SPREFETCH-GISEL-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; SPREFETCH-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -252,22 +254,22 @@ entry:
 }
 
 define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x800000
-; GFX12-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
-; GFX12-SDAG-NEXT:    s_endpgm
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_too_large_offset:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: prefetch_inst_sgpr_too_large_offset:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_endpgm
+; SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset:
+; SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x800000
+; SPREFETCH-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; SPREFETCH-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800000
-; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
-; GFX12-GISEL-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
-; GFX12-GISEL-NEXT:    s_endpgm
+; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
+; SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800000
+; SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
+; SPREFETCH-GISEL-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; SPREFETCH-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index dd77e575e7505..874dece6b728d 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -1,33 +1,60 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s
 
 define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
-; GCN-LABEL: copy_flat:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_cmp_eq_u32 s6, 0
-; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
-; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
-; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
-; GCN-NEXT:  .LBB0_2: ; %for.body
-; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_wait_alu 0xfffe
-; GCN-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GCN-NEXT:    s_add_co_i32 s6, s6, -1
-; GCN-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
-; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
-; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GCN-NEXT:    flat_store_b128 v[4:5], v[0:3]
-; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
-; GCN-NEXT:  .LBB0_3: ; %for.end
-; GCN-NEXT:    s_endpgm
+; GFX12-LABEL: copy_flat:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX12-NEXT:    s_cbranch_scc1 .LBB0_3
+; GFX12-NEXT:  ; %bb.1: ; %for.body.preheader
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX12-NEXT:  .LBB0_2: ; %for.body
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT:    s_add_co_i32 s6, s6, -1
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b128 v[4:5], v[0:3]
+; GFX12-NEXT:    s_cbranch_scc1 .LBB0_2
+; GFX12-NEXT:  .LBB0_3: ; %for.end
+; GFX12-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_flat:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB0_3
+; GFX12-SPREFETCH-NEXT:  ; %bb.1: ; %for.body.preheader
+; GFX12-SPREFETCH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX12-SPREFETCH-NEXT:  .LBB0_2: ; %for.body
+; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT:    s_wait_alu 0xfffe
+; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s6, s6, -1
+; GFX12-SPREFETCH-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12-SPREFETCH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SPREFETCH-NEXT:    flat_store_b128 v[4:5], v[0:3]
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB0_2
+; GFX12-SPREFETCH-NEXT:  .LBB0_3: ; %for.end
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %cmp6.not = icmp eq i32 %n, 0
   br i1 %cmp6.not, label %for.end, label %for.body
@@ -48,30 +75,54 @@ for.end:                                          ; preds = %for.body, %entry
 }
 
 define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
-; GCN-LABEL: copy_global:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_cmp_eq_u32 s6, 0
-; GCN-NEXT:    s_cbranch_scc1 .LBB1_3
-; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
-; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
-; GCN-NEXT:  .LBB1_2: ; %for.body
-; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
-; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT:    s_add_co_i32 s6, s6, -1
-; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_wait_loadcnt 0x0
-; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
-; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
-; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
-; GCN-NEXT:  .LBB1_3: ; %for.end
-; GCN-NEXT:    s_endpgm
+; GFX12-LABEL: copy_global:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX12-NEXT:    s_cbranch_scc1 .LBB1_3
+; GFX12-NEXT:  ; %bb.1: ; %for.body.preheader
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX12-NEXT:  .LBB1_2: ; %for.body
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
+; GFX12-NEXT:    s_add_co_i32 s6, s6, -1
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12-NEXT:    s_cbranch_scc1 .LBB1_2
+; GFX12-NEXT:  .LBB1_3: ; %for.end
+; GFX12-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_global:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB1_3
+; GFX12-SPREFETCH-NEXT:  ; %bb.1: ; %for.body.preheader
+; GFX12-SPREFETCH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX12-SPREFETCH-NEXT:  .LBB1_2: ; %for.body
+; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s6, s6, -1
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-SPREFETCH-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SPREFETCH-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB1_2
+; GFX12-SPREFETCH-NEXT:  .LBB1_3: ; %for.end
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %cmp6.not = icmp eq i32 %n, 0
   br i1 %cmp6.not, label %for.end, label %for.body
@@ -92,31 +143,56 @@ for.end:                                          ; preds = %for.body, %entry
 }
 
 define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
-; GCN-LABEL: copy_constant:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_cmp_eq_u32 s6, 0
-; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
-; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
-; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:  .LBB2_2: ; %for.body
-; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
-; GCN-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
-; GCN-NEXT:    s_add_co_i32 s6, s6, -1
-; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
-; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
-; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
-; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
-; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
-; GCN-NEXT:  .LBB2_3: ; %for.end
-; GCN-NEXT:    s_endpgm
+; GFX12-LABEL: copy_constant:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX12-NEXT:    s_cbranch_scc1 .LBB2_3
+; GFX12-NEXT:  ; %bb.1: ; %for.body.preheader
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:  .LBB2_2: ; %for.body
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
+; GFX12-NEXT:    s_add_co_i32 s6, s6, -1
+; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
+; GFX12-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12-NEXT:    s_cbranch_scc1 .LBB2_2
+; GFX12-NEXT:  .LBB2_3: ; %for.end
+; GFX12-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_constant:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB2_3
+; GFX12-SPREFETCH-NEXT:  ; %bb.1: ; %for.body.preheader
+; GFX12-SPREFETCH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SPREFETCH-NEXT:  .LBB2_2: ; %for.body
+; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
+; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s6, s6, -1
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
+; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
+; GFX12-SPREFETCH-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
+; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB2_2
+; GFX12-SPREFETCH-NEXT:  .LBB2_3: ; %for.end
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %cmp6.not = icmp eq i32 %n, 0
   br i1 %cmp6.not, label %for.end, label %for.body
@@ -137,30 +213,55 @@ for.end:                                          ; preds = %for.body, %entry
 }
 
 define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
-; GCN-LABEL: copy_local:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_cmp_eq_u32 s2, 0
-; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
-; GCN-NEXT:  .LBB3_1: ; %for.body
-; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_wait_alu 0xfffe
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-NEXT:    s_add_co_i32 s2, s2, -1
-; GCN-NEXT:    s_add_co_i32 s0, s0, 16
-; GCN-NEXT:    s_add_co_i32 s1, s1, 16
-; GCN-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
-; GCN-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_wait_dscnt 0x1
-; GCN-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
-; GCN-NEXT:    s_wait_dscnt 0x1
-; GCN-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
-; GCN-NEXT:    s_cbranch_scc1 .LBB3_1
-; GCN-NEXT:  .LBB3_2: ; %for.end
-; GCN-NEXT:    s_endpgm
+; GFX12-LABEL: copy_local:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX12-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX12-NEXT:  .LBB3_1: ; %for.body
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-NEXT:    v_mov_b32_e32 v4, s0
+; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 16
+; GFX12-NEXT:    s_add_co_i32 s1, s1, 16
+; GFX12-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
+; GFX12-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-NEXT:    s_wait_dscnt 0x1
+; GFX12-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
+; GFX12-NEXT:    s_wait_dscnt 0x1
+; GFX12-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX12-NEXT:  .LBB3_2: ; %for.end
+; GFX12-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_local:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX12-SPREFETCH-NEXT:  .LBB3_1: ; %for.body
+; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT:    s_wait_alu 0xfffe
+; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v4, s0
+; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s2, s2, -1
+; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s0, s0, 16
+; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s1, s1, 16
+; GFX12-SPREFETCH-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
+; GFX12-SPREFETCH-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
+; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-SPREFETCH-NEXT:    s_wait_dscnt 0x1
+; GFX12-SPREFETCH-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
+; GFX12-SPREFETCH-NEXT:    s_wait_dscnt 0x1
+; GFX12-SPREFETCH-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
+; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX12-SPREFETCH-NEXT:  .LBB3_2: ; %for.end
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %cmp6.not = icmp eq i32 %n, 0
   br i1 %cmp6.not, label %for.end, label %for.body


        


More information about the llvm-commits mailing list