[llvm] [DirectX] Mark buffer load/store as mem read/write (PR #114105)

Greg Roth via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 30 16:55:01 PDT 2024


https://github.com/pow2clk updated https://github.com/llvm/llvm-project/pull/114105

>From 3775e77b74bd6dbbc0e79ea499a38c67fa08ec3d Mon Sep 17 00:00:00 2001
From: Greg Roth <grroth at microsoft.com>
Date: Tue, 29 Oct 2024 11:39:28 -0600
Subject: [PATCH 1/2] [DirectX] Mark buffer load/store as mem read/write

By giving these intrinsics their appropriate memory attributes, loads from allocas that are stored to on the other side of these calls can be eliminated.

Adds a test that verifies that the unneeded loads can be eliminated and also that the attributes are set properly.

Fixes #104271

This may be the first part of a broader audit of
---
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |  6 +-
 .../DirectX/ResourceGlobalElimination.ll      | 85 +++++++++++++++++++
 2 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index e30d37f69f781e..88c17a3378cda5 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -28,12 +28,12 @@ def int_dx_handle_fromBinding
           [IntrNoMem]>;
 
 def int_dx_typedBufferLoad
-    : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>;
+    : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty], [IntrReadMem]>;
 def int_dx_typedBufferLoad_checkbit
     : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
-                            [llvm_any_ty, llvm_i32_ty]>;
+                            [llvm_any_ty, llvm_i32_ty], [IntrReadMem]>;
 def int_dx_typedBufferStore
-    : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty]>;
+    : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty], [IntrWriteMem]>;
 
 // Cast between target extension handle types and dxil-style opaque handles
 def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;
diff --git a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
new file mode 100644
index 00000000000000..c7ba9377911008
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
@@ -0,0 +1,85 @@
+; RUN: opt -S -passes='early-cse<memssa>' %s | FileCheck %s
+
+; Ensure that EarlyCSE is able to eliminate unneeded loads of resource globals across typedBufferLoad.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxilv1.6-unknown-shadermodel6.6-compute"
+
+%"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x float>, 1, 0, 0) }
+
+@In = global %"class.hlsl::RWBuffer" zeroinitializer, align 4
+@Out = global %"class.hlsl::RWBuffer" zeroinitializer, align 4
+
+; Function Attrs: convergent noinline norecurse
+; CHECK-LABEL define void @main()
+define void @main() local_unnamed_addr #0 {
+entry:
+  %tmp = alloca target("dx.TypedBuffer", <4 x float>, 1, 0, 0), align 4
+  %In_h.i = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false)
+  store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %In_h.i, ptr @In, align 4
+  %Out_h.i = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 4, i32 1, i32 1, i32 0, i1 false)
+  store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, ptr @Out, align 4
+  ; CHECK: call i32 @llvm.dx.flattened.thread.id.in.group()
+  %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
+  ; CHECK-NOT: load {{.*}} ptr @In
+  %1 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
+  ; CHECK call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
+  %2 = call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
+  ; CHECK-NOT: load {{.*}} ptr @In
+  %3 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
+  %4 = call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
+  %add.i = fadd <4 x float> %2, %4
+  store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, ptr %tmp, align 4
+  call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, i32 %0, <4 x float> %add.i)
+  ret void
+}
+
+; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.dx.flattened.thread.id.in.group() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn
+; CHECK: declare <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
+declare <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn
+; CHECK: declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]]
+declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32, i32, i32, i32, i1) #3
+
+; CHECK: attributes [[ROAttr]] = { {{.*}} memory(read) }
+; CHECK: attributes [[WOAttr]] = { {{.*}} memory(write) }
+
+attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="8,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1}
+!dx.valver = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{i32 1, i32 8}
+!3 = !{!"clang version 20.0.0git (git@github.com:llvm/llvm-project.git 54dc966bd3d375d7c1604fac5fdac20989c1072a)"}
+!4 = !{!5}
+!5 = distinct !{!5, !6, !"_ZN4hlsl8RWBufferIDv4_fEixEi: %agg.result"}
+!6 = distinct !{!6, !"_ZN4hlsl8RWBufferIDv4_fEixEi"}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTSN4hlsl8RWBufferIDv4_fEE", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!11 = !{!12}
+!12 = distinct !{!12, !13, !"_ZN4hlsl8RWBufferIDv4_fEixEi: %agg.result"}
+!13 = distinct !{!13, !"_ZN4hlsl8RWBufferIDv4_fEixEi"}
+!14 = !{!15}
+!15 = distinct !{!15, !16, !"_ZN4hlsl8RWBufferIDv4_fEixEi: %agg.result"}
+!16 = distinct !{!16, !"_ZN4hlsl8RWBufferIDv4_fEixEi"}
+!17 = !{!18, !9, i64 0}
+!18 = !{!"_ZTSN4hlsl8__detail18TypedResourceProxyIU9_Res_u_CTDv4_fu17__hlsl_resource_tS2_EE", !9, i64 0, !19, i64 4}
+!19 = !{!"int", !9, i64 0}
+!20 = !{!21}
+!21 = distinct !{!21, !22, !"_ZN4hlsl8__detail18TypedResourceProxyIU9_Res_u_CTDv4_fu17__hlsl_resource_tS2_EaSES2_: %agg.result"}
+!22 = distinct !{!22, !"_ZN4hlsl8__detail18TypedResourceProxyIU9_Res_u_CTDv4_fu17__hlsl_resource_tS2_EaSES2_"}

>From d013656cfcc9ababfe0707a9cf7157ebe1e3d191 Mon Sep 17 00:00:00 2001
From: Greg Roth <grroth at microsoft.com>
Date: Wed, 30 Oct 2024 17:54:18 -0600
Subject: [PATCH 2/2] [DirectX] Eliminate resource global variables from module

By giving these intrinsics their appropriate attributes, loads of globals that are stored on the other side of these calls can be eliminated by the EarlyCSE pass.  Stores to the same globals and the globals themselves require more direct intervention as part of the handleFromBinding lowering.

Adds a test that verifies that the unneeded globals and their uses can be eliminated and also that the attributes are set properly.

Fixes #104271
---
 llvm/lib/Target/DirectX/DXILOpLowering.cpp    | 20 ++++++++++++++
 .../DirectX/ResourceGlobalElimination.ll      | 26 ++++++++++++-------
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 8acc9c1efa08c0..34b2253a529828 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -204,6 +204,22 @@ class OpLowerer {
     CleanupCasts.clear();
   }
 
+  // Remove the resource globals associated with the handleFromBinding call
+  // instruction, along with the stores to them, as they aren't needed anymore.
+  void removeResourceGlobals(CallInst *CI) {
+    for (User *U : make_early_inc_range(CI->users())) {
+      if (auto *Store = dyn_cast<StoreInst>(U)) {
+        if (auto *GV = dyn_cast<GlobalVariable>(Store->getPointerOperand())) {
+          Store->eraseFromParent();
+          // Dead constant users count as uses; drop them before the check.
+          GV->removeDeadConstantUsers();
+          assert(GV->use_empty() && "Buffer global still has users");
+          GV->eraseFromParent();
+        }
+      }
+    }
+  }
+
   [[nodiscard]] bool lowerToCreateHandle(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int8Ty = IRB.getInt8Ty();
@@ -228,6 +244,8 @@ class OpLowerer {
 
       Value *Cast = createTmpHandleCast(*OpCall, CI->getType());
 
+      removeResourceGlobals(CI);
+
       CI->replaceAllUsesWith(Cast);
       CI->eraseFromParent();
       return Error::success();
@@ -272,6 +290,8 @@ class OpLowerer {
 
       Value *Cast = createTmpHandleCast(*OpAnnotate, CI->getType());
 
+      removeResourceGlobals(CI);
+
       CI->replaceAllUsesWith(Cast);
       CI->eraseFromParent();
 
diff --git a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
index c7ba9377911008..1d1c632244b3ba 100644
--- a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
@@ -1,12 +1,19 @@
-; RUN: opt -S -passes='early-cse<memssa>' %s | FileCheck %s
+; RUN: opt -S -passes='early-cse<memssa>' %s -o %t
+; RUN: FileCheck --check-prefixes=CSE,CHECK %s < %t
+; finish compiling to verify that dxil-op-lower removes the globals entirely
+; RUN: llc -mtriple=dxil-pc-shadermodel6.0-compute  --filetype=asm -o - %t | FileCheck --check-prefixes=LLC,CHECK %s
+; RUN: llc -mtriple=dxil-pc-shadermodel6.6-compute  --filetype=asm -o - %t | FileCheck --check-prefixes=LLC,CHECK %s
 
 ; Ensure that EarlyCSE is able to eliminate unneeded loads of resource globals across typedBufferLoad.
+; Also ensure that DXILOpLowering eliminates the globals entirely.
 
 target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
 target triple = "dxilv1.6-unknown-shadermodel6.6-compute"
 
 %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x float>, 1, 0, 0) }
 
+; LLC-NOT: @In = global
+; LLC-NOT: @Out = global
 @In = global %"class.hlsl::RWBuffer" zeroinitializer, align 4
 @Out = global %"class.hlsl::RWBuffer" zeroinitializer, align 4
 
@@ -14,23 +21,24 @@ target triple = "dxilv1.6-unknown-shadermodel6.6-compute"
 ; CHECK-LABEL define void @main()
 define void @main() local_unnamed_addr #0 {
 entry:
-  %tmp = alloca target("dx.TypedBuffer", <4 x float>, 1, 0, 0), align 4
+  ; LLC: %In_h.i1 = call %dx.types.Handle @dx.op.createHandle
+  ; LLC: %Out_h.i2 = call %dx.types.Handle @dx.op.createHandle
   %In_h.i = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false)
   store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %In_h.i, ptr @In, align 4
   %Out_h.i = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 4, i32 1, i32 1, i32 0, i1 false)
   store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, ptr @Out, align 4
-  ; CHECK: call i32 @llvm.dx.flattened.thread.id.in.group()
+  ; CSE: call i32 @llvm.dx.flattened.thread.id.in.group()
   %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
   ; CHECK-NOT: load {{.*}} ptr @In
   %1 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
-  ; CHECK call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
+  ; CSE: call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
   %2 = call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
   ; CHECK-NOT: load {{.*}} ptr @In
   %3 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
   %4 = call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
   %add.i = fadd <4 x float> %2, %4
-  store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, ptr %tmp, align 4
   call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, i32 %0, <4 x float> %add.i)
+  ; CHECK: ret void
   ret void
 }
 
@@ -38,18 +46,18 @@ entry:
 declare i32 @llvm.dx.flattened.thread.id.in.group() #1
 
 ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn
-; CHECK: declare <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
+; CSE: declare <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
 declare <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) #2
 
 ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn
-; CHECK: declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]]
+; CSE: declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]]
 declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) #2
 
 ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
 declare target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32, i32, i32, i32, i1) #3
 
-; CHECK: attributes [[ROAttr]] = { {{.*}} memory(read) }
-; CHECK: attributes [[WOAttr]] = { {{.*}} memory(write) }
+; CSE: attributes [[ROAttr]] = { {{.*}} memory(read) }
+; CSE: attributes [[WOAttr]] = { {{.*}} memory(write) }
 
 attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="8,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(none) }



More information about the llvm-commits mailing list