[llvm] [lldb] [libc] [clang] [flang] [clang-tools-extra] [libcxx] [mlir] [compiler-rt] [lld] GFX12: Add LoopDataPrefetchPass (PR #75625)

Mon Dec 18 14:05:04 PST 2023

https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/75625

>From de5303eb8a9e061dbd365922f85cad02bca5ec26 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 5 Jul 2022 11:41:29 -0700
Subject: [PATCH 1/4] GFX12: Add LoopDataPrefetchPass

It is currently disabled by default. It will need experiments on a real
HW to tune and decide on the profitability.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   7 +
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  18 ++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   4 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   8 +-
 .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 185 ++++++++++++++++++
 6 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e8c04ecf39ba02..fdc2077868cf99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -345,6 +345,11 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool>
+    EnableLoopPrefetch("amdgpu-loop-prefetch",
+                       cl::desc("Enable loop data prefetch on AMDGPU"),
+                       cl::Hidden, cl::init(false));
+
 static cl::opt<bool> EnableMaxIlpSchedStrategy(
     "amdgpu-enable-max-ilp-scheduling-strategy",
     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 }
 
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
+    addPass(createLoopDataPrefetchPass());
   addPass(createSeparateConstOffsetFromGEPPass());
   // ReassociateGEPs exposes more opportunities for SLSR. See
   // the example in reassociate-geps-and-slsr.ll.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index f1da1a61bf4dd5..218c5b5cfdac87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1345,3 +1345,21 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
   Cost.first += (Size + 255) / 256;
   return Cost;
 }
+
+unsigned GCNTTIImpl::getPrefetchDistance() const {
+  return ST->hasPrefetch() ? 128 : 0;
+}
+
+bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
+  switch (AS) {
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1e6c5bbfc0d75b..cd8e9fd10bbf21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -254,6 +254,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
                                          TTI::TargetCostKind CostKind);
+
+  /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
+  unsigned getCacheLineSize() const override { return 128; }
+
+  /// How much before a load we should place the prefetch instruction.
+  /// This is currently measured in number of IR instructions.
+  unsigned getPrefetchDistance() const override;
+
+  /// \return if target want to issue a prefetch in address space \p AS.
+  bool shouldPrefetchAddressSpace(unsigned AS) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70ef1fff274a40..717f22fb69fdd3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
     return false;
 
+  // A mayLoad instruction without a def is not a load. Likely a prefetch.
+  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
+    return false;
+
   if (isDS(Opc0) && isDS(Opc1)) {
 
     // FIXME: Handle this case:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 488dbe2e3189bf..8b0b6263832243 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
 ; RUN:   | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O1 %s
 ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -amdgpu-scalar-ir-passes -amdgpu-sdwa-peephole \
-; RUN:   -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -debug-pass=Structure < %s 2>&1 \
+; RUN:   -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -amdgpu-loop-prefetch -debug-pass=Structure < %s 2>&1 \
 ; RUN:   | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O1-OPTS %s
 ; RUN: llc -O2 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
 ; RUN:   | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O2 %s
@@ -461,6 +461,12 @@
 ; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
+; GCN-O1-OPTS-NEXT:      Canonicalize natural loops
+; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
+; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
+; GCN-O1-OPTS-NEXT:      Optimization Remark Emitter
+; GCN-O1-OPTS-NEXT:      Scalar Evolution Analysis
+; GCN-O1-OPTS-NEXT:      Loop Data Prefetch
 ; GCN-O1-OPTS-NEXT:      Split GEPs to a variadic base and a constant offset for better CSE
 ; GCN-O1-OPTS-NEXT:      Scalar Evolution Analysis
 ; GCN-O1-OPTS-NEXT:      Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
new file mode 100644
index 00000000000000..fb3c04235b8e4d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s
+
+define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
+; GCN-LABEL: copy_flat:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 0
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
+; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT:    .p2align 6
+; GCN-NEXT:  .LBB0_2: ; %for.body
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
+; GCN-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GCN-NEXT:    s_add_co_i32 s4, s4, -1
+; GCN-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
+; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    flat_store_b128 v[4:5], v[0:3]
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
+; GCN-NEXT:  .LBB0_3: ; %for.end
+; GCN-NEXT:    s_endpgm
+entry:
+  %cmp6.not = icmp eq i32 %n, 0
+  br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = zext i32 %i.07 to i64
+  %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
+  %ld = load <4 x i32>, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
+  store <4 x i32> %ld, ptr %arrayidx2, align 4
+  %inc = add nuw i32 %i.07, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
+; GCN-LABEL: copy_global:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 0
+; GCN-NEXT:    s_cbranch_scc1 .LBB1_3
+; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT:  .LBB1_2: ; %for.body
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
+; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
+; GCN-NEXT:    s_add_co_i32 s4, s4, -1
+; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
+; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
+; GCN-NEXT:  .LBB1_3: ; %for.end
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+entry:
+  %cmp6.not = icmp eq i32 %n, 0
+  br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = zext i32 %i.07 to i64
+  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
+  %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
+  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
+  %inc = add nuw i32 %i.07, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
+; GCN-LABEL: copy_constant:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 0
+; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
+; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:  .LBB2_2: ; %for.body
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
+; GCN-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
+; GCN-NEXT:    s_add_co_i32 s4, s4, -1
+; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
+; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
+; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
+; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
+; GCN-NEXT:  .LBB2_3: ; %for.end
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+entry:
+  %cmp6.not = icmp eq i32 %n, 0
+  br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = zext i32 %i.07 to i64
+  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
+  %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
+  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
+  %inc = add nuw i32 %i.07, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
+; GCN-LABEL: copy_local:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
+; GCN-NEXT:  .LBB3_1: ; %for.body
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_add_co_i32 s2, s2, -1
+; GCN-NEXT:    s_add_co_i32 s0, s0, 16
+; GCN-NEXT:    s_add_co_i32 s1, s1, 16
+; GCN-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
+; GCN-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
+; GCN-NEXT:    s_cmp_lg_u32 s2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(1)
+; GCN-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
+; GCN-NEXT:    s_waitcnt lgkmcnt(1)
+; GCN-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
+; GCN-NEXT:    s_cbranch_scc1 .LBB3_1
+; GCN-NEXT:  .LBB3_2: ; %for.end
+; GCN-NEXT:    s_endpgm
+entry:
+  %cmp6.not = icmp eq i32 %n, 0
+  br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = zext i32 %i.07 to i64
+  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
+  %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
+  store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
+  %inc = add nuw i32 %i.07, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}

>From 8f41e750a437c655b6f6e08fa230d6eb02557cd8 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Mon, 18 Dec 2023 09:21:06 +0100
Subject: [PATCH 2/4] Use isFlatGlobalAddrSpace

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 218c5b5cfdac87..ebe0b8551b236a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1351,15 +1351,5 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 }
 
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
-  switch (AS) {
-  case AMDGPUAS::FLAT_ADDRESS:
-  case AMDGPUAS::GLOBAL_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
-    return true;
-  default:
-    break;
-  }
-
-  return false;
+  return AMDGPU::isFlatGlobalAddrSpace(AS);
 }

>From 6296613d1d3f3e8b63e1af1d12d28f2000facc0e Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Mon, 18 Dec 2023 22:47:50 +0100
Subject: [PATCH 3/4] remove

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ebe0b8551b236a..c25ccd9eccb68d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1351,5 +1351,5 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 }
 
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
-  return AMDGPU::isFlatGlobalAddrSpace(AS);
+  return     AMDGPU::isFlatGlobalAddrSpace(AS);
 }

>From 08f178e91a949cccd05aa5cdab13f7a8e86dee7e Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Mon, 18 Dec 2023 23:04:29 +0100
Subject: [PATCH 4/4] fixup for clang-format

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c25ccd9eccb68d..ebe0b8551b236a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1351,5 +1351,5 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 }
 
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
-  return     AMDGPU::isFlatGlobalAddrSpace(AS);
+  return AMDGPU::isFlatGlobalAddrSpace(AS);
 }